From a6cfe91b4a5e283ad846cec08c4f6856823ce4d7 Mon Sep 17 00:00:00 2001
From: Ryan Routsong <routsongrm@nih.gov>
Date: Thu, 27 Jun 2024 12:27:59 -0400
Subject: [PATCH 01/28] chore: refactor rule all, move some files to more
 appropriate locations

---
 .../scripts/createtable => bin/creattable.py  |   0
 .../filterMetrics => bin/filterMetrics.py     |   8 +-
 {workflow/scripts => bin}/jobby               |   0
 {workflow/scripts => bin}/ppqt/LICENSE        |   0
 {workflow/scripts => bin}/ppqt/README.txt     |   0
 .../ppqt/peakCallingPipelineForIdr.txt        |   0
 {workflow/scripts => bin}/ppqt/run_spp.R      |   0
 .../scripts => bin}/ppqt/run_spp_nodups.R     |   0
 .../scripts => bin}/ppqt/spp_1.10.1.tar.gz    | Bin
 workflow/Snakefile                            | 402 +++++++-----------
 workflow/scripts/grouping.py                  |  59 +++
 11 files changed, 206 insertions(+), 263 deletions(-)
 rename workflow/scripts/createtable => bin/creattable.py (100%)
 rename workflow/scripts/filterMetrics => bin/filterMetrics.py (93%)
 rename {workflow/scripts => bin}/jobby (100%)
 rename {workflow/scripts => bin}/ppqt/LICENSE (100%)
 rename {workflow/scripts => bin}/ppqt/README.txt (100%)
 rename {workflow/scripts => bin}/ppqt/peakCallingPipelineForIdr.txt (100%)
 rename {workflow/scripts => bin}/ppqt/run_spp.R (100%)
 rename {workflow/scripts => bin}/ppqt/run_spp_nodups.R (100%)
 rename {workflow/scripts => bin}/ppqt/spp_1.10.1.tar.gz (100%)
 create mode 100644 workflow/scripts/grouping.py

diff --git a/workflow/scripts/createtable b/bin/creattable.py
similarity index 100%
rename from workflow/scripts/createtable
rename to bin/creattable.py
diff --git a/workflow/scripts/filterMetrics b/bin/filterMetrics.py
similarity index 93%
rename from workflow/scripts/filterMetrics
rename to bin/filterMetrics.py
index 1d56ade..6562b71 100755
--- a/workflow/scripts/filterMetrics
+++ b/bin/filterMetrics.py
@@ -60,9 +60,9 @@ def getmetadata(type):
 	elif type == 'tnreads':
 		metadata = 'NReads'
 	elif type == 'mnreads':
-                metadata = 'NMappedReads'
+		metadata = 'NMappedReads'
 	elif type == 'unreads':
-                metadata = 'NUniqMappedReads'
+		metadata = 'NUniqMappedReads'
 	elif type == 'fragLen':
 		metadata = 'FragmentLength'
 	return metadata
@@ -88,11 +88,11 @@ def filteredData(sample, ftype):
 			extenders = []
 			for ppqt_value in linelist:
 				if int(ppqt_value) > 150:
-                            		extenders.append(ppqt_value)
+					extenders.append(ppqt_value)
 			if len(extenders) > 0:
 				print("{}\t{}\t{}".format(sample, mtypes, extenders[0]))
 			else:
-                        	print("{}\t{}\t{}".format(sample, mtypes, linelist[0]))
+				print("{}\t{}\t{}".format(sample, mtypes, linelist[0]))
 		elif ftype == 'ppqt' or ftype == 'ngsqc' or ftype == 'nrf':
 			mtypes = getmetadata(ftype)
 			for i in range(len(linelist)):
diff --git a/workflow/scripts/jobby b/bin/jobby
similarity index 100%
rename from workflow/scripts/jobby
rename to bin/jobby
diff --git a/workflow/scripts/ppqt/LICENSE b/bin/ppqt/LICENSE
similarity index 100%
rename from workflow/scripts/ppqt/LICENSE
rename to bin/ppqt/LICENSE
diff --git a/workflow/scripts/ppqt/README.txt b/bin/ppqt/README.txt
similarity index 100%
rename from workflow/scripts/ppqt/README.txt
rename to bin/ppqt/README.txt
diff --git a/workflow/scripts/ppqt/peakCallingPipelineForIdr.txt b/bin/ppqt/peakCallingPipelineForIdr.txt
similarity index 100%
rename from workflow/scripts/ppqt/peakCallingPipelineForIdr.txt
rename to bin/ppqt/peakCallingPipelineForIdr.txt
diff --git a/workflow/scripts/ppqt/run_spp.R b/bin/ppqt/run_spp.R
similarity index 100%
rename from workflow/scripts/ppqt/run_spp.R
rename to bin/ppqt/run_spp.R
diff --git a/workflow/scripts/ppqt/run_spp_nodups.R b/bin/ppqt/run_spp_nodups.R
similarity index 100%
rename from workflow/scripts/ppqt/run_spp_nodups.R
rename to bin/ppqt/run_spp_nodups.R
diff --git a/workflow/scripts/ppqt/spp_1.10.1.tar.gz b/bin/ppqt/spp_1.10.1.tar.gz
similarity index 100%
rename from workflow/scripts/ppqt/spp_1.10.1.tar.gz
rename to bin/ppqt/spp_1.10.1.tar.gz
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 777e616..6a92c24 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -15,272 +15,156 @@ from scripts.common import (
     references,
     str_bool
 )
+from scripts.grouping import group_samples_by_reps, group_output_files
 
-# Timestamp in YYYYMMDD format
-today = str(datetime.datetime.today()).split()[0].replace('-', '')
-
-# Global workflow variables
 configfile: "config.json"
-samples  = config['samples']
-workpath = config['project']['workpath']
-tmpdir = config['options']['tmp_dir']
-genome   = config['options']['genome']         # Reference genome of a set of samples
-assay   = config['options']['assay']
-blocks = config['project']['blocks']
-
 
-if None in list(blocks.values()):
-    blocking = False
-else:
-    blocking = True
-
-# Check for SE or PE FastQ files:
-convert = {1: False, 2: True}                     # 1 = SE, 2 = PE, -1 = Unknown
-try:
-    paired_end = convert[config['project']['nends']]  # True if PE else false
-except KeyError:
-    # Catching case when value is -1 or unknown
-    sys.exit("Fatal: Raw data could not be classified as single-end or paired-end data!")
-    
-# Analysis options
-# Run differential binding pipeline
-run_dba = True 
-if config['options']['contrasts'] == 'None':
-    run_dba = False
+# Global workflow variables
+today                           = str(datetime.datetime.today()).split()[0].replace('-', '') # YYYYMMDD
+samples                         = config['samples']
+workpath                        = config['project']['workpath']
+tmpdir                          = config['options']['tmp_dir']
+genome                          = config['options']['genome']
+assay                           = config['options']['assay']
+blocks                          = config['project']['blocks']
+blocking                        = False if None in list(blocks.values()) else True
+convert                         = {1: False, 2: True}  # 1 = SE, 2 = PE, -1 = Unknown
+paired_end                      = convert[config['project']['nends']]  # True if PE else false
+run_dba                         = False config['options']['contrasts'] is None else True
+extensions                      = ["sorted.RPGC", "Q5DD.RPGC"]
+chips                           = config['project']['peaks']['chips']
+contrast                        = config['project']['contrast']
+UropaCats                       = ["protTSS", "prot", "protSEC", "genes"]
+zipGroup1, zipGroup2, zipToolC, contrasts \
+                                = zip_contrasts(contrast, PeakTools)
+extensionsDict                  = {"sorted": "bam", "Q5DD":"bam"} if paired_end \
+                                    else {"sorted": "bam", "Q5DD_tagAlign": "gz"}
+file_exts                       = list(extensionsDict.keys())
+extensionsFull                  = ['sorted.bam', 'Q5DD.bam'] if paired_end \
+                                    else ['sorted.bam', 'Q5DD_tagAlign.gz']
+
+# Directory end points
+trim_dir                        = "trim"
+kraken_dir                      = "kraken"
+bam_dir                         = join(workpath, "bam")
+bw_dir                          = join(workpath, "bigwig")
+qc_dir                          = join(workpath, "QC")
+ppqt_dir                        = join(bam_dir, "ppqt")
+macsN_dir                       = join(workpath, "macsNarrow")
+macsB_dir                       = join(workpath, "macsBroad")
+sicer_dir                       = join(workpath, "sicer")
+peakqc_dir                      = join(workpath, "PeakQC")
+uropa_dir                       = join(workpath, "UROPA_annotations")
+diffbind_dir                    = join(uropa_dir, "DiffBind")
+cfTool_dir                      = join(workpath, "cfChIPtool")
+genrich_dir                     = join(workpath, "Genrich")
+MEME_dir                        = join(workpath, "MEME")
+
+# Extended data structures
+'''
+:param chip2input <dict>: map (1:1) from sample id to input
+{
+    "WT_S1": "Input_S1",
+    "WT_S2": "Input_S2",
+    "WT_S3": "Input_S3",
+    "WT_S4": "Input_S4"
+    ...
+}
+'''
+chip2input                      = config['project']['peaks']['inputs']
+
+'''
+:param groupdata <dict>: map (1:M) of group id to sample ids
+{
+    "G1": ["WT_S1", "WT_S2"],
+    "G2": ["WT_S3", "WT_S4"]
+    ...
+}
+'''
+groupdata                       = config['project']['groups']
+
+'''
+:param groupdatawinput <dict>: 
+{
+    "G1": ["WT_S1", "WT_S2"],
+    "G2": ["WT_S3", "WT_S4"]
+    ...
+}
+
+:param groupswreps <list>:
+    ["G1", "G2", ...]
+'''
+groupdatawinput, groupswreps    = group_samples_by_reps(groupdata, samples, chip2input)
+groups                          = list(groupdatawinput.keys())
+reps                            = False if len(groupswreps) > 0 else True
+uniq_inputs                     = list(sorted(set([v for v in chip2input.values() if v])))
+sampleswinput                   = [
+    chip_value for input_id, chip_value in chip2input.items() \
+    if chip_value != 'NA' and chip_value != ''
+]
+inputnorm                       = [""] if len(sampleswinput) == 0 else ["", ".inputnorm"]
+deepgroups, deepexts            = group_output_files(extensions, groups,inputnorm)
 
 # Read in resource information,
 # containing information about 
 # threads, mem, walltimes, etc.
-# TODO: Add handler for when the
-# mode is set to local.
 with open(join('config', 'cluster.json')) as fh:
     cluster = json.load(fh)
 
-# Functions
-def outputfiles2(extensions, groupslist, inputnorm):
-    """
-    Produces correct output filenames based on group information.
-    Names will be:
-    Inputnorm.Q5DD.RPGC.metagene_heatmap.pdf
-    {groupName}.Q5DD.RPGC.metagene_heatmap.pdf
-    {groupName}.sorted.RPGC.metagene_heatmap.pdf
-    Note: Inputnorm will only be included when there are input samples.
-    """
-    dtoolgroups, dtoolext = [], []
-    
-    if len(inputnorm) == 2:
-            dtoolgroups.extend(["InputNorm"])
-            dtoolext.extend([extensions[1]])
-    
-    for group in groupslist:
-            dtoolgroups.extend([group] * 2)
-            dtoolext.extend([extensions[1], extensions[0]])
-    
-    if len(inputnorm) == 2:
-            dtoolgroups.extend(["InputNorm.prot"])
-            dtoolext.extend([extensions[1]])
-    
-    for group in groupslist:
-            dtoolgroups.extend([group + ".prot"] * 2)
-            dtoolext.extend([extensions[1], extensions[0]])
-    
-    return dtoolgroups, dtoolext
-
-def zip_contrasts(contrast, PeakTools):
-    """making output file names for differential binding analyses"""
-    zipGroup1, zipGroup2, zipTool, contrasts = [], [], [], []
-    for g1, g2 in contrast:
-        for PeakTool in PeakTools:
-            zipGroup1.append(g1)
-            zipGroup2.append(g2)
-            zipTool.append(PeakTool)
-            contrasts.append( g1 + "_vs_" + g2 + "-" + PeakTool )
-    return(zipGroup1, zipGroup2, zipTool, contrasts)
-
-
-extensions = [ "sorted.RPGC", "Q5DD.RPGC" ]
-
-
-# Getting sample relationships from config
-# using ChIP/input nomenclature. NOTE: ATAC
-# won't have input samples
-
-###########
-chip2input = config['project']['peaks']['inputs'] #{"WT_S1": "Input_S1","WT_S2": "Input_S2","WT_S3": "Input_S3","WT_S4": "Input_S4"}
-groupdata = config['project']['groups'] # {"G1": ["WT_S1","WT_S2"],"G2": ["WT_S3","WT_S4"]}
-
-groupdatawinput = {}
-groupswreps = []
-for group, chipsamples in groupdata.items() :
-    tmp = [ ]
-    if len(chipsamples) > 1:
-        groupswreps.append(group)
-    for chip in chipsamples :
-        if chip in samples:
-            tmp.append(chip)
-            input = chip2input[chip]
-            if input != 'NA' and input != '':
-                tmp.append(input)
-    if len(tmp) != 0:
-        groupdatawinput[group]=set(tmp)
-
-groups = list(groupdatawinput.keys())
-
-reps="no"
-if len(groupswreps) > 0:
-    reps="yes"
-##############
-
-uniq_inputs = list(sorted(set([v for v in chip2input.values() if v])))
-
-sampleswinput = []
-for input in chip2input:
-	if chip2input[input] != 'NA' and chip2input[input] != '':
-		sampleswinput.append(input)
-
-
-if len(sampleswinput) == 0:
-    inputnorm = [""]
-else:
-    inputnorm = ["",".inputnorm"]
-
-
-deepgroups, deepexts = outputfiles2(extensions, groups,inputnorm)
-
-
-
-# Directory names
-trim_dir='trim'
-kraken_dir='kraken'
-bam_dir='bam'
-bw_dir='bigwig'
-deeptools_dir='deeptools'
-extra_fingerprint_dir='deeptools/sorted_fingerprint'
-qc_dir="QC"
-ppqt_dir="ppqt"
-macsN_dir="macsNarrow"
-macsB_dir="macsBroad"
-sicer_dir="sicer"
-
-uropa_dir = "UROPA_annotations"
-diffbind_dir = "DiffBind"
-diffbind_dir_block = "DiffBindBlock"
-
-if assay == "atac": 
-    PeakTools = ["macsNarrow", "Genrich"] 
-elif assay == "chip":
-    PeakTools = ["macsNarrow", "macsBroad", "sicer"]
-else: 
-    PeakTools = ["macsNarrow"]
-
-chips = config['project']['peaks']['chips']
-contrast = config['project']['contrast']
-UropaCats = ["protTSS", "prot", "protSEC", "genes"]
-extensions = ["sorted.RPGC", "Q5DD.RPGC"]
-
-# Setup to run with ChIP samples, 
-# which could include IgG samples
-cfTool_dir="cfChIPtool"
-cfTool_subdir2="cfChIPtool/BED/H3K4me3"
-
-zipGroup1, zipGroup2, zipToolC, contrasts = zip_contrasts(contrast, PeakTools)
-# Final targets of the pipeline
- 
-if paired_end:
-    extensionsDict = {"sorted": "bam", "Q5DD":"bam"}
-    extensionsFull = ['sorted.bam', 'Q5DD.bam']
-else:
-    extensionsDict= {"sorted": "bam", "Q5DD_tagAlign": "gz"}
-    extensionsFull = ['sorted.bam', 'Q5DD_tagAlign.gz']
-
-if assay == "cfchip":
-    rule all:
-        input: 
-            join(workpath,"multiqc_report.html"),
-            expand(join(workpath,qc_dir,"{name}.{ext}.insert_size_metrics.txt"),name=samples,ext=list(extensionsDict.keys())),
-            expand(join(workpath,bam_dir,"{name}.{ext}"),name=samples,ext=extensionsFull),
-            expand(join(workpath,qc_dir,"{name}.preseq.dat"), name=samples),
-            expand(join(workpath,macsN_dir,"{name}","{name}_peaks.narrowPeak"),name=chips),
-            expand(join(workpath,"PeakQC","{PeakTool}.{name}.Q5DD.FRiP_table.txt"), PeakTool=PeakTools, name=samples),
-            expand(join(workpath,cfTool_dir,"Output","H3K4me3","Signatures","{name}.Q5DD.csv"),name=chips),
-            join(workpath,"QC","H3K4me3_cfChIP_signature.txt"),
-            expand(join(workpath,bw_dir,"{name}.{ext}.RPGC.bw"),name=samples, ext=["sorted", "Q5DD"]),
-            expand(join(workpath,bw_dir,"{name}.Q5DD.RPGC.inputnorm.bw"), name=sampleswinput),
-            expand(join(workpath,uropa_dir,'{PeakTool}','{name}_{PeakTool}_uropa_{type}_allhits.txt'),
-                PeakTool=PeakTools,name=chips,type=["protTSS"]),
-            expand(join(workpath, "QC", "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBindQC_TMMcounts.bed"), PeakTool=PeakTools),
-	    expand(join(workpath, uropa_dir, "QC", "AllSamples-macsNarrow_{PeakTool}_uropa_{type}_allhits.txt"),
-                PeakTool="DiffBindQC", type="protTSS"),
-            expand(join(workpath,uropa_dir,"promoterTable1",'{PeakTool}_promoter_overlap_summaryTable.txt'),PeakTool=PeakTools),
-            provided(expand(join(workpath,diffbind_dir,"{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), 
-                zip,group1=zipGroup1,group2=zipGroup2,PeakTool=zipToolC), reps == "yes"),
-            provided(expand(join(workpath,uropa_dir,diffbind_dir,'{name}_{PeakTool}_uropa_{type}_allhits.txt'), 
-                PeakTool=['DiffbindEdgeR','DiffbindDeseq2'],name=contrasts,type=["protTSS"]), reps == "yes"),
-            provided(expand(join(workpath,uropa_dir,"promoterTable2",'DiffbindDeseq2_{PeakTool}_promoter_overlap_summaryTable.txt'), 
-                PeakTool=PeakTools),reps == "yes" and contrast),
-
-elif assay in ["atac", "chip"]:
-    rule all:
-        input: 
-            join(workpath,"multiqc_report.html"),
-            provided(expand(join(workpath,qc_dir,"{name}.{ext}.insert_size_metrics.txt"),name=samples,ext=list(extensionsDict.keys())), paired_end==True),
-            expand(join(workpath,bam_dir,"{name}.{ext}"),name=samples,ext=extensionsFull),
-            expand(join(workpath,qc_dir,"{name}.preseq.dat"), name=samples),
-            expand(join(workpath,macsN_dir,"{name}","{name}_peaks.narrowPeak"),name=chips),
-            provided(expand(join(workpath,"macsBroad","{name}","{name}_peaks.broadPeak"),name=chips), assay=="chip"),
-            provided(expand(join(workpath,"sicer","{name}","{name}_broadpeaks.bed"),name=chips), assay=="chip"),
-            expand(join(workpath,"PeakQC","{PeakTool}.{name}.Q5DD.FRiP_table.txt"), PeakTool=PeakTools, name=samples),
-            expand(join(workpath,bw_dir,"{name}.{ext}.RPGC.bw"),name=samples, ext=["sorted", "Q5DD"]),
-            expand(join(workpath,bw_dir,"{name}.Q5DD.RPGC.inputnorm.bw"), name=sampleswinput),
-            expand(join(workpath,uropa_dir,'{PeakTool}','{name}_{PeakTool}_uropa_{type}_allhits.txt'),
-                PeakTool=PeakTools,name=chips,type=["protTSS", "prot", "protSEC", "genes"]),
-            
-            provided(expand(join(workpath,diffbind_dir,"{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), 
-                zip,group1=zipGroup1,group2=zipGroup2,PeakTool=zipToolC), reps == "yes"),
-            provided(expand(join(workpath,uropa_dir,'{PeakTool}','{name}_{PeakTool}_uropa_{type}_allhits.txt'),
-                PeakTool=PeakTools,name=chips,type=["protTSS", "prot", "protSEC", "genes"]), reps == "yes"),
-            provided(expand(join(workpath,uropa_dir,diffbind_dir,'{name}_{PeakTool}_uropa_{type}_allhits.txt'), 
-                PeakTool=['DiffbindEdgeR','DiffbindDeseq2'],name=contrasts,type=["protTSS", "prot", "protSEC", "genes"]), reps == "yes"),
-
-            provided(expand(join(workpath,"Genrich","{name}","{name}.narrowPeak"),name=chips), assay=="atac"),
-
-            provided(expand(join(workpath,bam_dir,ppqt_dir,"{name}.{ext}.ppqt"), name=samples, ext=["sorted", "Q5DD"]), paired_end == True and assay=="chip"),
-            provided(expand(join(workpath,bam_dir,ppqt_dir,"{name}.{ext}.pdf"), name=samples, ext=["sorted", "Q5DD"]), paired_end == True and assay=="chip"),
-            provided(expand(join(workpath,bam_dir,ppqt_dir,"{name}.{ext}.ppqt.txt"),name=samples, ext=["sorted", "Q5DD"]), paired_end == True and assay=="chip"),
-            provided(expand(join(workpath,bam_dir,ppqt_dir,"{name}.{ext}.ppqt"), name=samples, ext=["sorted", "Q5DD_tagAlign"]), paired_end == False and assay=="chip"),
-            provided(expand(join(workpath,bam_dir,ppqt_dir,"{name}.{ext}.pdf"), name=samples, ext=["sorted", "Q5DD_tagAlign"]), paired_end == False and assay=="chip"),
-            provided(expand(join(workpath,bam_dir,ppqt_dir,"{name}.{ext}.ppqt.txt"),name=samples, ext=["sorted", "Q5DD_tagAlign"]), paired_end == False and assay=="chip"),
-            expand(join(workpath, "MEME", "{PeakTool}", "{name}_meme", "meme-chip.html"), PeakTool=PeakTools, name=chips),
-            expand(join(workpath, "MEME", "{PeakTool}", "{name}_ame", "ame.html"), PeakTool=PeakTools, name=chips)
-
-            
-#############################
-# Pipeline hooks for Onstart,
-# onsucess, and onerror
-include: join("rules", "hooks.smk")
-
-# QC/alignment rules: trim_pe,
-# BWA_PE, picard_dedup, bam2bw,
-# inputnorm
-include: "rules/trim_align_dedup.smk"
-
-# QC rules common to all: preseq, NRF,
-# rawfastqc, fastqc, fastq_screen, 
-# kraken_pe, multiqc, insert_size
-include: "rules/qc.smk"
-
-# MACS2_narrow
-# if assay=="atac" then run rules sortByRead and genrich
-include: "rules/peakcall.smk"
-
-#FRiP, FRiP_plot, jaccard rules
-include: "rules/peakcall_qc.smk"
-
-#UROPA, DiffBind, and manorm rules
-include: "rules/dba.smk"
-
-# cfChIP-specific QC rules:
-# cfChIPtool, cfChIPcompile,
-# promoterTable1:, and promoterTable2
-if assay == "cfchip":
-    include: "rules/cfChIP.smk"
+rule all:
+    input:
+        if assay == "cfchip":
+            peak_types = ["protTSS"]
+            join(qc_dir, "H3K4me3_cfChIP_signature.txt"),
+            expand(join(qc_dir, "{name}.{ext}.insert_size_metrics.txt"), name=samples, ext=file_exts)
+            expand(join(cfTool_dir, "Output", "H3K4me3", "Signatures", "{name}.Q5DD.csv"), name=chips),
+            expand(join(uropa_dir, "{PeakTool}", "{name}_{PeakTool}_uropa_{_type}_allhits.txt"),
+                PeakTool=PeakTools, name=chips, _type=peak_types),
+            expand(join(uropa_dir, "QC", "AllSamples-macsNarrow_{PeakTool}_uropa_{_type}_allhits.txt"),
+                PeakTool="DiffBindQC", _type=peak_types),
+            expand(join(uropa_dir, "promoterTable1", "{PeakTool}_promoter_overlap_summaryTable.txt"), PeakTool=PeakTools),
+            expand(join(diffbind_dir, "{name}_{PeakTool}_uropa_{_type}_allhits.txt"),
+                PeakTool=['DiffbindEdgeR','DiffbindDeseq2'], name=contrasts, _type=peak_types)
+            if reps:
+                expand(join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), 
+                    group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC)
+                expand(join(diffbind_dir, "{name}_{PeakTool}_uropa_{type}_allhits.txt"), 
+                    PeakTool=['DiffbindEdgeR','DiffbindDeseq2'], name=contrasts, _type=peak_types)
+
+        else assay in ["atac", "chip"]:
+            peak_types = ["protTSS", "prot", "protSEC", "genes"]
+            expand(join(MEME_dir, "{PeakTool}", "{name}_meme", "meme-chip.html"), PeakTool=PeakTools, name=chips),
+            expand(join(MEME_dir, "{PeakTool}", "{name}_ame", "ame.html"), PeakTool=PeakTools, name=chips)
+            if paired_end:
+                expand(join(qc_dir, "{name}.{ext}.insert_size_metrics.txt"), name=samples, ext=file_exts)
+            if assay == "chip":
+                expand(join(macsB_dir, "{name}", "{name}_peaks.broadPeak"), name=chips)
+                expand(join(sicer_dir, "{name}", "{name}_broadpeaks.bed"), name=chips)
+                if paired_end:
+                    short_ext, tag_ext = ["sorted", "Q5DD"], ["sorted", "Q5DD_tagAlign"]
+                    expand(join(ppqt_dir, "{name}.{ext}.ppqt"), name=samples, ext=short_ext),
+                    expand(join(ppqt_dir, "{name}.{ext}.pdf"), name=samples, ext=short_ext),
+                    expand(join(ppqt_dir, "{name}.{ext}.ppqt.txt"), name=samples, ext=short_ext)
+                    expand(join(ppqt_dir, "{name}.{ext}.ppqt"), name=samples, ext=tag_ext)
+                    expand(join(ppqt_dir, "{name}.{ext}.pdf"), name=samples, ext=tag_ext)
+                    expand(join(ppqt_dir, "{name}.{ext}.ppqt.txt"), name=samples, ext=tag_ext)
+            if assay == "atac":
+                expand(join(genrich_dir, "{name}", "{name}.narrowPeak"), name=chips)
+            if reps:
+                expand(join(uropa_dir, "{PeakTool}", "{name}_{PeakTool}_uropa_{type}_allhits.txt"),
+                    PeakTool=PeakTools, name=chips, _type=peak_types),
+                expand(join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), 
+                    group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC)
+                expand(join(diffbind_dir, "{name}_{PeakTool}_uropa_{type}_allhits.txt"), 
+                    PeakTool=["DiffbindEdgeR", "DiffbindDeseq2"], name=contrasts, _type=peak_types)
+                if contrast:
+                    expand(join(uropa_dir, "promoterTable2", "DiffbindDeseq2_{PeakTool}_promoter_overlap_summaryTable.txt"), 
+                        PeakTool=PeakTools)
+        join(workpath,"multiqc_report.html"),
+        expand(join(qc_dir, "{name}.preseq.dat"), name=samples),
+        expand(join(qc_dir, "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBindQC_TMMcounts.bed"), PeakTool=PeakTools),
+        expand(join(bam_dir, "{name}.{ext}"), name=samples, ext=extensionsFull),
+        expand(join(macsN_dir, "{name}","{name}_peaks.narrowPeak"), name=chips),
+        expand(join(peakqc_dir, "{PeakTool}.{name}.Q5DD.FRiP_table.txt"), PeakTool=PeakTools, name=samples),
+        expand(join(bw_dir, "{name}.{ext}.RPGC.bw"), name=samples, ext=["sorted", "Q5DD"]),
+        expand(join(bw_dir, "{name}.Q5DD.RPGC.inputnorm.bw"), name=sampleswinput),
\ No newline at end of file
diff --git a/workflow/scripts/grouping.py b/workflow/scripts/grouping.py
new file mode 100644
index 0000000..6e39613
--- /dev/null
+++ b/workflow/scripts/grouping.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+# common functions related to sample grouping or group meta-information
+def group_samples_by_reps(groupdata, samples, chip2input):
+    groupdatawinput = {}
+    groupswreps = []
+    for group, chipsamples in groupdata.items() :
+        tmp = [ ]
+        if len(chipsamples) > 1:
+            groupswreps.append(group)
+        for chip in chipsamples :
+            if chip in samples:
+                tmp.append(chip)
+                input = chip2input[chip]
+                if input != 'NA' and input != '':
+                    tmp.append(input)
+        if len(tmp) != 0:
+            groupdatawinput[group]=set(tmp)
+    return groupdatawinput, groupswreps
+
+
+def group_output_files(extensions, groupslist, inputnorm):
+    """
+    Produces correct output filenames based on group information.
+    Names will be:
+    Inputnorm.Q5DD.RPGC.metagene_heatmap.pdf
+    {groupName}.Q5DD.RPGC.metagene_heatmap.pdf
+    {groupName}.sorted.RPGC.metagene_heatmap.pdf
+    Note: Inputnorm will only be included when there are input samples.
+    """
+    dtoolgroups, dtoolext = [], []
+    
+    if len(inputnorm) == 2:
+            dtoolgroups.extend(["InputNorm"])
+            dtoolext.extend([extensions[1]])
+    
+    for group in groupslist:
+            dtoolgroups.extend([group] * 2)
+            dtoolext.extend([extensions[1], extensions[0]])
+    
+    if len(inputnorm) == 2:
+            dtoolgroups.extend(["InputNorm.prot"])
+            dtoolext.extend([extensions[1]])
+    
+    for group in groupslist:
+            dtoolgroups.extend([group + ".prot"] * 2)
+            dtoolext.extend([extensions[1], extensions[0]])
+    
+    return dtoolgroups, dtoolext
+
+def zip_contrasts(contrast, PeakTools):
+    """making output file names for differential binding analyses"""
+    zipGroup1, zipGroup2, zipTool, contrasts = [], [], [], []
+    for g1, g2 in contrast:
+        for PeakTool in PeakTools:
+            zipGroup1.append(g1)
+            zipGroup2.append(g2)
+            zipTool.append(PeakTool)
+            contrasts.append( g1 + "_vs_" + g2 + "-" + PeakTool )
+    return(zipGroup1, zipGroup2, zipTool, contrasts)
\ No newline at end of file

From e89e0bd7691cb6771ffc089b0f63a29faf66ddef Mon Sep 17 00:00:00 2001
From: Ryan Routsong <routsongrm@nih.gov>
Date: Fri, 5 Jul 2024 17:50:35 -0400
Subject: [PATCH 02/28] chore: refactor script locations, workflow rules,
 remove uropa rules for the time being

---
 src/run.py                                    |   4 +-
 workflow/Snakefile                            | 261 ++++---
 workflow/rules/cfChIP.smk                     |  19 +-
 workflow/rules/common.smk                     |   1 -
 workflow/rules/dba.smk                        | 180 ++---
 workflow/rules/peakcall.smk                   | 105 ++-
 workflow/rules/peakcall_qc.smk                |  46 --
 workflow/rules/qc.smk                         | 436 ++++++-----
 workflow/rules/trim_align_dedup.smk           | 700 +++++++++---------
 workflow/scripts/DiffBind_v2_ChIPseq.Rmd      | 260 -------
 .../scripts/DiffBind_v2_ChIPseq_block.Rmd     | 267 -------
 workflow/scripts/DiffBind_v2_cfChIP_QC.Rmd    | 204 -----
 workflow/scripts/FRiP_plot.R                  | 112 ---
 workflow/scripts/atac_nrf.py                  |  22 -
 workflow/scripts/bam_filter_by_mapq.py        |  40 -
 workflow/scripts/blocking.py                  |  28 +
 workflow/scripts/cfChIP_signatures.R          |  97 ---
 workflow/scripts/common.py                    |  61 +-
 workflow/scripts/frip.py                      | 164 ----
 workflow/scripts/grouping.py                  |  54 +-
 workflow/scripts/jaccard_score.py             | 202 -----
 workflow/scripts/peakcall.py                  | 105 +++
 workflow/scripts/ppqt_process.py              |  27 -
 workflow/scripts/prep_diffbind.py             |  54 --
 workflow/scripts/prep_diffbindQC.py           |  51 --
 workflow/scripts/promoterAnnotation_by_Gene.R | 179 -----
 workflow/scripts/significantPathways.R        | 127 ----
 27 files changed, 1065 insertions(+), 2741 deletions(-)
 delete mode 100644 workflow/rules/common.smk
 delete mode 100644 workflow/rules/peakcall_qc.smk
 delete mode 100644 workflow/scripts/DiffBind_v2_ChIPseq.Rmd
 delete mode 100644 workflow/scripts/DiffBind_v2_ChIPseq_block.Rmd
 delete mode 100644 workflow/scripts/DiffBind_v2_cfChIP_QC.Rmd
 delete mode 100644 workflow/scripts/FRiP_plot.R
 delete mode 100644 workflow/scripts/atac_nrf.py
 delete mode 100644 workflow/scripts/bam_filter_by_mapq.py
 create mode 100644 workflow/scripts/blocking.py
 delete mode 100755 workflow/scripts/cfChIP_signatures.R
 delete mode 100644 workflow/scripts/frip.py
 delete mode 100644 workflow/scripts/jaccard_score.py
 create mode 100644 workflow/scripts/peakcall.py
 delete mode 100644 workflow/scripts/ppqt_process.py
 delete mode 100644 workflow/scripts/prep_diffbind.py
 delete mode 100644 workflow/scripts/prep_diffbindQC.py
 delete mode 100755 workflow/scripts/promoterAnnotation_by_Gene.R
 delete mode 100755 workflow/scripts/significantPathways.R

diff --git a/src/run.py b/src/run.py
index b327cc8..34fc423 100644
--- a/src/run.py
+++ b/src/run.py
@@ -207,6 +207,7 @@ def setup(sub_args, ifiles, repo_path, output_path):
     # Add other runtime info for debugging
     config['project']['version'] = __version__
     config['project']['workpath'] = os.path.abspath(sub_args.output)
+    config['project']['binpath'] = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'bin'))
     git_hash = git_commit_hash(repo_path)
     config['project']['git_commit_hash'] = git_hash   # Add latest git commit hash
     config['project']['pipeline_path'] = repo_path    # Add path to installation
@@ -221,7 +222,8 @@ def setup(sub_args, ifiles, repo_path, output_path):
             v = str(v)
         config['options'][opt] = v
 
-
+    # initiate a few workflow vars
+    config['options']['peak_type_base'] = ["protTSS"]
     return config
 
 
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 6a92c24..026c56d 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -1,21 +1,13 @@
 # Python standard library
+import datetime
+import json
 from os.path import join
 from os import listdir
-import os, sys, re, datetime
-import json
-
-# 3rd party imports from pypi
-from snakemake.workflow import workflow as wf_api
-from snakemake.utils import R
 
 # Local imports
-from scripts.common import (
-    allocated,
-    provided, 
-    references,
-    str_bool
-)
-from scripts.grouping import group_samples_by_reps, group_output_files
+from scripts.common import provided, get_file_components
+from scripts.grouping import group_samples_by_reps, \
+    group_output_files, zip_contrasts, get_peaktools
 
 configfile: "config.json"
 
@@ -23,29 +15,30 @@ configfile: "config.json"
 today                           = str(datetime.datetime.today()).split()[0].replace('-', '') # YYYYMMDD
 samples                         = config['samples']
 workpath                        = config['project']['workpath']
-tmpdir                          = config['options']['tmp_dir']
-genome                          = config['options']['genome']
 assay                           = config['options']['assay']
-blocks                          = config['project']['blocks']
-blocking                        = False if None in list(blocks.values()) else True
-convert                         = {1: False, 2: True}  # 1 = SE, 2 = PE, -1 = Unknown
-paired_end                      = convert[config['project']['nends']]  # True if PE else false
-run_dba                         = False config['options']['contrasts'] is None else True
-extensions                      = ["sorted.RPGC", "Q5DD.RPGC"]
+paired_end                      = False if config['project']['nends'] == 1 else True
 chips                           = config['project']['peaks']['chips']
 contrast                        = config['project']['contrast']
-UropaCats                       = ["protTSS", "prot", "protSEC", "genes"]
+chip2input                      = config['project']['peaks']['inputs']
+groupdata                       = config['project']['groups']
+peak_types                      = config['options']['peak_type_base']
+rule_all_ins                    = []
+groupdatawinput, groupswreps    = group_samples_by_reps(groupdata, samples, chip2input)
+PeakTools                       = get_peaktools(assay)
 zipGroup1, zipGroup2, zipToolC, contrasts \
                                 = zip_contrasts(contrast, PeakTools)
-extensionsDict                  = {"sorted": "bam", "Q5DD":"bam"} if paired_end \
-                                    else {"sorted": "bam", "Q5DD_tagAlign": "gz"}
-file_exts                       = list(extensionsDict.keys())
-extensionsFull                  = ['sorted.bam', 'Q5DD.bam'] if paired_end \
-                                    else ['sorted.bam', 'Q5DD_tagAlign.gz']
+file_stems, extRPGC, extaln     = get_file_components(paired_end)
+groups                          = list(groupdatawinput.keys())
+reps                            = False if len(groupswreps) > 0 else True
+uniq_inputs                     = list(sorted(set([v for v in chip2input.values() if v])))
+sampleswinput                   = [
+    chip_value for input_id, chip_value in chip2input.items() \
+    if chip_value != 'NA' and chip_value != ''
+]
+inputnorm                       = [""] if len(sampleswinput) == 0 else ["", ".inputnorm"]
+deepgroups, deepexts            = group_output_files(extRPGC, groups, inputnorm)
 
 # Directory end points
-trim_dir                        = "trim"
-kraken_dir                      = "kraken"
 bam_dir                         = join(workpath, "bam")
 bw_dir                          = join(workpath, "bigwig")
 qc_dir                          = join(workpath, "QC")
@@ -55,116 +48,120 @@ macsB_dir                       = join(workpath, "macsBroad")
 sicer_dir                       = join(workpath, "sicer")
 peakqc_dir                      = join(workpath, "PeakQC")
 uropa_dir                       = join(workpath, "UROPA_annotations")
-diffbind_dir                    = join(uropa_dir, "DiffBind")
+diffbind_dir                    = join(workpath, "DiffBind")
 cfTool_dir                      = join(workpath, "cfChIPtool")
 genrich_dir                     = join(workpath, "Genrich")
 MEME_dir                        = join(workpath, "MEME")
 
-# Extended data structures
-'''
-:param chip2input <dict>: map (1:1) from sample id to input
-{
-    "WT_S1": "Input_S1",
-    "WT_S2": "Input_S2",
-    "WT_S3": "Input_S3",
-    "WT_S4": "Input_S4"
-    ...
-}
-'''
-chip2input                      = config['project']['peaks']['inputs']
-
-'''
-:param groupdata <dict>: map (1:M) of group id to sample ids
-{
-    "G1": ["WT_S1", "WT_S2"],
-    "G2": ["WT_S3", "WT_S4"]
-    ...
-}
-'''
-groupdata                       = config['project']['groups']
-
-'''
-:param groupdatawinput <dict>: 
-{
-    "G1": ["WT_S1", "WT_S2"],
-    "G2": ["WT_S3", "WT_S4"]
-    ...
-}
-
-:param groupswreps <list>:
-    ["G1", "G2", ...]
-'''
-groupdatawinput, groupswreps    = group_samples_by_reps(groupdata, samples, chip2input)
-groups                          = list(groupdatawinput.keys())
-reps                            = False if len(groupswreps) > 0 else True
-uniq_inputs                     = list(sorted(set([v for v in chip2input.values() if v])))
-sampleswinput                   = [
-    chip_value for input_id, chip_value in chip2input.items() \
-    if chip_value != 'NA' and chip_value != ''
-]
-inputnorm                       = [""] if len(sampleswinput) == 0 else ["", ".inputnorm"]
-deepgroups, deepexts            = group_output_files(extensions, groups,inputnorm)
-
-# Read in resource information,
-# containing information about 
-# threads, mem, walltimes, etc.
+# Read in resource information
 with open(join('config', 'cluster.json')) as fh:
     cluster = json.load(fh)
 
+if assay == "cfchip":
+    rule_all_ins.append(join(
+        qc_dir, "H3K4me3_cfChIP_signature.txt"
+    ))
+    rule_all_ins.extend(expand(
+        join(qc_dir, "{name}.{stem}.insert_size_metrics.txt"), 
+        name=samples, 
+        stem=file_stems
+    ))
+    rule_all_ins.extend(expand(
+        join(cfTool_dir, "Output", "H3K4me3", "Signatures", "{name}.Q5DD.csv"), 
+        name=chips
+    ))
+    rule_all_ins.extend(expand(
+        join(uropa_dir, "{PeakTool}", "{name}_{PeakTool}_uropa_{_type}_allhits.txt"),
+        PeakTool=PeakTools, 
+        name=chips, 
+        _type=peak_types
+    ))
+    rule_all_ins.extend(expand(
+        join(uropa_dir, "QC", "AllSamples-macsNarrow_{PeakTool}_uropa_{_type}_allhits.txt"),
+        PeakTool="DiffBindQC", 
+        _type=peak_types
+    ))
+    rule_all_ins.extend(expand(
+        join(uropa_dir, "promoterTable1", "{PeakTool}_promoter_overlap_summaryTable.txt"), 
+        PeakTool=PeakTools
+    ))
+    # rule_all_ins.extend(expand(
+    #     join(diffbind_dir, "{name}_{PeakTool}_uropa_{_type}_allhits.txt"),
+    #     PeakTool=['DiffbindEdgeR','DiffbindDeseq2'], 
+    #     name=contrasts, 
+    #     _type=peak_types
+    # ))
+    if reps:
+        rule_all_ins.extend(expand(
+            join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), 
+            group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC
+        ))
+        rule_all_ins.extend(expand(
+            join(diffbind_dir, "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), PeakTool=['DiffbindEdgeR','DiffbindDeseq2'],
+            name=contrasts, _type=peak_types  # keyword must match the {_type} placeholder
+        ))
+elif assay in ["atac", "chip"]:
+    peak_types.extend(["prot", "protSEC", "genes"])
+    rule_all_ins.extend(expand(join(MEME_dir, "{PeakTool}", "{name}_meme", "meme-chip.html"), PeakTool=PeakTools, name=chips))
+    rule_all_ins.extend(expand(join(MEME_dir, "{PeakTool}", "{name}_ame", "ame.html"), PeakTool=PeakTools, name=chips))
+    if paired_end:
+        rule_all_ins.extend(expand(join(qc_dir, "{name}.{stem}.insert_size_metrics.txt"), name=samples, stem=file_stems))
+    if assay == "chip":
+        rule_all_ins.extend(expand(join(macsB_dir, "{name}", "{name}_peaks.broadPeak"), name=chips))
+        rule_all_ins.extend(expand(join(sicer_dir, "{name}", "{name}_broadpeaks.bed"), name=chips))
+        if paired_end:
+            short_ext, tag_ext = ["sorted", "Q5DD"], ["sorted", "Q5DD_tagAlign"]
+            rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt"), name=samples, ext=short_ext))
+            rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.pdf"), name=samples, ext=short_ext))
+            rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt.txt"), name=samples, ext=short_ext))
+            rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt"), name=samples, ext=tag_ext))
+            rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.pdf"), name=samples, ext=tag_ext))
+            rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt.txt"), name=samples, ext=tag_ext))
+    if assay == "atac":
+        rule_all_ins.extend(expand(
+            join(genrich_dir, "{name}", "{name}.narrowPeak"), name=chips
+        ))
+    if reps:
+        rule_all_ins.extend(expand(
+            join(uropa_dir, "{PeakTool}", "{name}_{PeakTool}_uropa_{_type}_allhits.txt"),
+            PeakTool=PeakTools, name=chips, _type=peak_types  # keyword must match the {_type} placeholder
+        ))
+        rule_all_ins.extend(expand(
+            join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), 
+            group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC
+        ))
+        # rule_all_ins.extend(expand(
+        #     join(uropa_dir, "{name}_{PeakTool}_uropa_{type}_allhits.txt"), 
+        #     PeakTool=["DiffbindEdgeR", "DiffbindDeseq2"], 
+        #     name=contrasts, 
+        #     _type=peak_types
+        # ))
+        if contrast:
+            rule_all_ins.extend(expand(
+                join(uropa_dir, "promoterTable2", "DiffbindDeseq2_{PeakTool}_promoter_overlap_summaryTable.txt"), 
+                PeakTool=PeakTools
+            ))
+rule_all_ins.append(join(workpath,"multiqc_report.html"))
+rule_all_ins.extend(expand(join(qc_dir, "{name}.preseq.dat"), name=samples))
+rule_all_ins.extend(
+    expand(join(qc_dir, "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBindQC_TMMcounts.bed"), PeakTool=PeakTools)
+)
+rule_all_ins.extend(expand(join(bam_dir, "{name}.{ext}"), name=samples, ext=extaln))
+rule_all_ins.extend(expand(join(macsN_dir, "{name}","{name}_peaks.narrowPeak"), name=chips))
+rule_all_ins.extend(
+    expand(join(peakqc_dir, "{PeakTool}.{name}.Q5DD.FRiP_table.txt"), PeakTool=PeakTools, name=samples)
+)
+rule_all_ins.extend(expand(join(bw_dir, "{name}.{ext}.RPGC.bw"), name=samples, ext=["sorted", "Q5DD"]))
+rule_all_ins.extend(expand(join(bw_dir, "{name}.Q5DD.RPGC.inputnorm.bw"), name=sampleswinput))
+
 rule all:
     input:
-        if assay == "cfchip":
-            peak_types = ["protTSS"]
-            join(qc_dir, "H3K4me3_cfChIP_signature.txt"),
-            expand(join(qc_dir, "{name}.{ext}.insert_size_metrics.txt"), name=samples, ext=file_exts)
-            expand(join(cfTool_dir, "Output", "H3K4me3", "Signatures", "{name}.Q5DD.csv"), name=chips),
-            expand(join(uropa_dir, "{PeakTool}", "{name}_{PeakTool}_uropa_{_type}_allhits.txt"),
-                PeakTool=PeakTools, name=chips, _type=peak_types),
-            expand(join(uropa_dir, "QC", "AllSamples-macsNarrow_{PeakTool}_uropa_{_type}_allhits.txt"),
-                PeakTool="DiffBindQC", _type=peak_types),
-            expand(join(uropa_dir, "promoterTable1", "{PeakTool}_promoter_overlap_summaryTable.txt"), PeakTool=PeakTools),
-            expand(join(diffbind_dir, "{name}_{PeakTool}_uropa_{_type}_allhits.txt"),
-                PeakTool=['DiffbindEdgeR','DiffbindDeseq2'], name=contrasts, _type=peak_types)
-            if reps:
-                expand(join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), 
-                    group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC)
-                expand(join(diffbind_dir, "{name}_{PeakTool}_uropa_{type}_allhits.txt"), 
-                    PeakTool=['DiffbindEdgeR','DiffbindDeseq2'], name=contrasts, _type=peak_types)
+        rule_all_ins
 
-        else assay in ["atac", "chip"]:
-            peak_types = ["protTSS", "prot", "protSEC", "genes"]
-            expand(join(MEME_dir, "{PeakTool}", "{name}_meme", "meme-chip.html"), PeakTool=PeakTools, name=chips),
-            expand(join(MEME_dir, "{PeakTool}", "{name}_ame", "ame.html"), PeakTool=PeakTools, name=chips)
-            if paired_end:
-                expand(join(qc_dir, "{name}.{ext}.insert_size_metrics.txt"), name=samples, ext=file_exts)
-            if assay == "chip":
-                expand(join(macsB_dir, "{name}", "{name}_peaks.broadPeak"), name=chips)
-                expand(join(sicer_dir, "{name}", "{name}_broadpeaks.bed"), name=chips)
-                if paired_end:
-                    short_ext, tag_ext = ["sorted", "Q5DD"], ["sorted", "Q5DD_tagAlign"]
-                    expand(join(ppqt_dir, "{name}.{ext}.ppqt"), name=samples, ext=short_ext),
-                    expand(join(ppqt_dir, "{name}.{ext}.pdf"), name=samples, ext=short_ext),
-                    expand(join(ppqt_dir, "{name}.{ext}.ppqt.txt"), name=samples, ext=short_ext)
-                    expand(join(ppqt_dir, "{name}.{ext}.ppqt"), name=samples, ext=tag_ext)
-                    expand(join(ppqt_dir, "{name}.{ext}.pdf"), name=samples, ext=tag_ext)
-                    expand(join(ppqt_dir, "{name}.{ext}.ppqt.txt"), name=samples, ext=tag_ext)
-            if assay == "atac":
-                expand(join(genrich_dir, "{name}", "{name}.narrowPeak"), name=chips)
-            if reps:
-                expand(join(uropa_dir, "{PeakTool}", "{name}_{PeakTool}_uropa_{type}_allhits.txt"),
-                    PeakTool=PeakTools, name=chips, _type=peak_types),
-                expand(join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), 
-                    group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC)
-                expand(join(diffbind_dir, "{name}_{PeakTool}_uropa_{type}_allhits.txt"), 
-                    PeakTool=["DiffbindEdgeR", "DiffbindDeseq2"], name=contrasts, _type=peak_types)
-                if contrast:
-                    expand(join(uropa_dir, "promoterTable2", "DiffbindDeseq2_{PeakTool}_promoter_overlap_summaryTable.txt"), 
-                        PeakTool=PeakTools)
-        join(workpath,"multiqc_report.html"),
-        expand(join(qc_dir, "{name}.preseq.dat"), name=samples),
-        expand(join(qc_dir, "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBindQC_TMMcounts.bed"), PeakTool=PeakTools),
-        expand(join(bam_dir, "{name}.{ext}"), name=samples, ext=extensionsFull),
-        expand(join(macsN_dir, "{name}","{name}_peaks.narrowPeak"), name=chips),
-        expand(join(peakqc_dir, "{PeakTool}.{name}.Q5DD.FRiP_table.txt"), PeakTool=PeakTools, name=samples),
-        expand(join(bw_dir, "{name}.{ext}.RPGC.bw"), name=samples, ext=["sorted", "Q5DD"]),
-        expand(join(bw_dir, "{name}.Q5DD.RPGC.inputnorm.bw"), name=sampleswinput),
\ No newline at end of file
+# Include child rules
+include: join("rules", "hooks.smk")
+include: join("rules", "trim_align_dedup.smk")
+include: join("rules", "qc.smk")
+include: join("rules", "peakcall.smk")
+include: join("rules", "dba.smk")
+include: join("rules", "cfChIP.smk")
\ No newline at end of file
diff --git a/workflow/rules/cfChIP.smk b/workflow/rules/cfChIP.smk
index fa521d6..672a05b 100644
--- a/workflow/rules/cfChIP.smk
+++ b/workflow/rules/cfChIP.smk
@@ -1,7 +1,16 @@
-# cell-free DNA ChIP-seq rules:
-#   - picard_dedup 
-#   - cfChIPtool
-#   - cfChIPcompile
+# cell-free ChIP-seq
+# ~~~~
+# rules: picard_dedup, cfChIPtool, cfChIPcompile
+
+
+# ~~ workflow configuration
+workpath                        = config['project']['workpath']
+genome                          = config['options']['genome']
+blocks                          = config['project']['blocks']
+groupdata                       = config['project']['groups']
+
+
+# ~~ directories
 
 
 rule cfChIPtool:
@@ -34,7 +43,7 @@ rule cfChIPtool:
 
 rule cfChIPcompile:
     input:
-        expand(join(workpath,cfTool_dir,"Output","H3K4me3","Signatures","{name}.Q5DD.csv"),name=chip)
+        expand(join(cfTool_dir, "Output", "H3K4me3", "Signatures", "{name}.Q5DD.csv"), name=chips)
     output:
         txt=join(workpath,"QC","H3K4me3_cfChIP_signature.txt"),
         pdf=join(workpath,"QC","H3K4me3_cfChIP_signature.pdf")
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
deleted file mode 100644
index a41b6b5..0000000
--- a/workflow/rules/common.smk
+++ /dev/null
@@ -1 +0,0 @@
-from scripts.common import abstract_location
diff --git a/workflow/rules/dba.smk b/workflow/rules/dba.smk
index 036bab1..69aecc4 100644
--- a/workflow/rules/dba.smk
+++ b/workflow/rules/dba.smk
@@ -1,72 +1,46 @@
-# TODO: This Snakefile needs to be completely refactored.
-# Python standard library
+# Differential binding analysis rules
+# ~~~~
 from os.path import join
 import os
+from scripts.common import allocated, mk_dir_if_not_exist
+from scripts.peakcall import outputIDR, zip_peak_files, calc_effective_genome_fraction
+from scripts.blocking import test_for_block
 
-# Local imports
-from scripts.common import (
-    allocated
-)
-
-def outputIDR(groupswreps, groupdata, chip2input, tools):
-    """
-    Produces the correct output files for IDR. All supposed replicates
-    should be directly compared when possible using IDR. IDR malfunctions
-    with bed files and GEM so it will not run with either of those.
-    Because there is no q-value calculated for SICER when there is no 
-    input file, those samples are also ignored.
-    """
-    IDRgroup, IDRsample1, IDRsample2, IDRpeaktool = [], [], [], []
-    for group in groupswreps:
-        nsamples = len(groupdata[group])
-        for i in range(nsamples):
-            ctrlTF = chip2input[groupdata[group][i]] != ""
-            for j in range(i+1,nsamples):
-                if ctrlTF == (chip2input[groupdata[group][j]] != ""):
-                    if ctrlTF == False:
-                        tooltmp = [ tool for tool in tools if tool != "sicer" ]
-                    else:
-                        tooltmp = tools			           
-                    IDRgroup.extend([group] * len(tooltmp))
-                    IDRsample1.extend([groupdata[group][i]] * len(tooltmp))
-                    IDRsample2.extend([groupdata[group][j]] * len(tooltmp))
-                    IDRpeaktool.extend(tooltmp)
-    return( IDRgroup, IDRsample1, IDRsample2, IDRpeaktool )
 
+# ~~ workflow configuration
+workpath                        = config['project']['workpath']
+genome                          = config['options']['genome']
+blocks                          = config['project']['blocks']
+groupdata                       = config['project']['groups']
 
-def zip_peak_files(chips, PeakTools, PeakExtensions):
-    """Making input file names for FRiP"""
-    zipSample, zipTool, zipExt = [], [], []
-    for chip in chips:
-        for PeakTool in PeakTools:
-            zipSample.append(chip)
-            zipTool.append(PeakTool)
-            zipExt.append(PeakExtensions[PeakTool])
-    return(zipSample, zipTool, zipExt)
 
+# ~~ directories
+bin_path                        = join(workpath, "workflow", "bin")
+diffbind_dir_block              = join(workpath, "DiffBindBlock")
+diffbind_dir2                   = join(workpath, "DiffBind_block")
+diffbind_dir                    = join(workpath, "DiffBind")
+bam_dir                         = join(workpath, "bam")
+qc_dir                          = join(workpath, "PeakQC")
+idr_dir                         = join(workpath, "IDR")
+memechip_dir                    = join(workpath, "MEME")
+homer_dir                       = join(workpath, "HOMER_motifs")
+uropa_dir                       = join(workpath, "UROPA_annotations")
+manorm_dir                      = join(workpath, "MANorm")
+downstream_dir                  = join(workpath, "Downstream")
+otherDirs                       = [qc_dir, homer_dir, uropa_dir]
+cfTool_dir                      = join(workpath, "cfChIPtool")
+cfTool_subdir2                  = join(cfTool_dir, "BED", "H3K4me3")
 
-def calc_effective_genome_fraction(effectivesize, genomefile):
-    """
-    calculate the effective genome fraction by calculating the
-    actual genome size from a .genome-like file and then dividing
-    the effective genome size by that number
-    """
-    lines=list(map(lambda x:x.strip().split("\t"),open(genomefile).readlines()))
-    genomelen=0
-    for chrom,l in lines:
-        if not "_" in chrom and chrom!="chrX" and chrom!="chrM" and chrom!="chrY":
-            genomelen+=int(l)
-    return(str(float(effectivesize)/ genomelen))
 
 
+# ~~ workflow switches
+blocking                        = False if None in list(blocks.values()) else True
+if reps: otherDirs.append(diffbind_dir)  # reps is a bool set in workflow/Snakefile, not the string "yes"
+mk_dir_if_not_exist(PeakTools + otherDirs)
 
-# PREPARING TO DEAL WITH A VARIED SET OF PEAKCALL TOOLS
-gem_dir = "gem"
-macsB_dir = "macsBroad"
-sicer_dir = "sicer"
 
+# ~~ peak calling configuration and outputs
 PeakToolsNG = [ tool for tool in PeakTools if tool != "gem" ]
-
 PeakExtensions = {
     'macsNarrow': '_peaks.narrowPeak',
     'macsBroad': '_peaks.broadPeak',
@@ -106,71 +80,29 @@ RankColIDR = {
     'macsBroad': 'q.value',
     'sicer': 'q.value'
 }
-
-
 IDRgroup, IDRsample1, IDRsample2, IDRpeaktool =	outputIDR(groupswreps, groupdata, chip2input, PeakToolsNG)
-
 zipSample, zipTool, zipExt = zip_peak_files(chips, PeakTools, PeakExtensions)
-
-
-# CREATING DIRECTORIES
-bam_dir='bam'
-qc_dir='PeakQC'
-idr_dir = 'IDR'
-memechip_dir = "MEME"
-homer_dir = "HOMER_motifs"
-manorm_dir = "MANorm"
-downstream_dir = "Downstream"
-
-otherDirs = [qc_dir, homer_dir, uropa_dir]
-if reps == "yes":
-    # otherDirs.append(idr_dir)
-    otherDirs.append(diffbind_dir)
-
-for d in PeakTools + otherDirs:
-        if not os.path.exists(join(workpath,d)):
-                os.mkdir(join(workpath,d))
-
-
-# Blocking code
-diffbind_dir2 = "DiffBind_block"
-blocks=config['project']['blocks']
-
-def test_for_block(contrast, blocks):
-   """ only want to run blocking on contrasts where all
-   individuals are on both sides of the contrast """
-   contrastBlock = [ ]
-   for con in contrast:
-       group1 = con[0]
-       group2 = con[1]
-       block1 = [ blocks[sample] for sample in groupdata[group1] ]
-       block2 = [ blocks[sample] for sample in groupdata[group2] ]
-       if len(block1) == len(block2):
-           if len(set(block1).intersection(block2)) == len(block1):
-                contrastBlock.append(con)
-   return contrastBlock  
-
-
-contrastBlock = test_for_block(contrast,blocks)
+contrastBlock = test_for_block(groupdata, contrast, blocks)
 zipGroup1B, zipGroup2B, zipToolCB, contrastsB = zip_contrasts(contrastBlock, PeakTools)
 
+# ~~ rules 
 
 rule diffbind:
     input:
         lambda w: [ join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) for chip in chips ]
     output:
-        html = join(workpath,diffbind_dir,"{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind.html"),
-        Deseq2 = join(workpath,diffbind_dir,"{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2.bed"),
-        EdgeR = join(workpath,diffbind_dir,"{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR.bed"),
-        EdgeR_txt = join(workpath,diffbind_dir,"{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR.txt"),
-        Deseq2_txt = join(workpath,diffbind_dir,"{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2.txt"),
-        EdgeR_ftxt = join(workpath,diffbind_dir,"{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR_fullList.txt"),
-        Deseq2_ftxt = join(workpath,diffbind_dir,"{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2_fullList.txt"),
-        html_block = provided(join(workpath,diffbind_dir_block,"{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_blocking.html"), blocking)
+        html = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind.html"),
+        Deseq2 = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2.bed"),
+        EdgeR = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR.bed"),
+        EdgeR_txt = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR.txt"),
+        Deseq2_txt = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2.txt"),
+        EdgeR_ftxt = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR_fullList.txt"),
+        Deseq2_ftxt = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2_fullList.txt"),
+        html_block = provided(join(diffbind_dir_block, "{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_blocking.html"), blocking)
     params:
-        rname="diffbind",
-        rscript = join(workpath,"workflow","scripts","DiffBind_v2_ChIPseq.Rmd"),
-        outdir    = join(workpath,diffbind_dir,"{group1}_vs_{group2}-{PeakTool}"),
+        rname = "diffbind",
+        rscript = join(workpath, "workflow", "scripts","DiffBind_v2_ChIPseq.Rmd"),
+        outdir    = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}"),
         contrast  = "{group1}_vs_{group2}",
         csvfile   = join(workpath,diffbind_dir,"{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_prep.csv"),
         pythonscript = join(workpath,"workflow","scripts","prep_diffbind.py"),
@@ -212,15 +144,15 @@ if assay == "cfchip":
         input:
             lambda w: [ join(workpath, w.PeakTool1, w.name, w.name + PeakExtensions[w.PeakTool2]) ]
         output:
-            txt=join(workpath, uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.txt'),
-            bed1=temp(join(workpath, uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.bed')),
-            bed2=temp(join(workpath, uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_finalhits.bed')),
+            txt=join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.txt'),
+            bed1=temp(join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.bed')),
+            bed2=temp(join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_finalhits.bed')),
         params:
             rname="uropa",
             uropaver = config['tools']['UROPAVER'],
-            fldr = join(workpath, uropa_dir, '{PeakTool1}'),
-            json = join(workpath, uropa_dir, '{PeakTool1}','{name}.{PeakTool2}.{type}.json'),
-            outroot = join(workpath, uropa_dir, '{PeakTool1}','{name}_{PeakTool2}_uropa_{type}'),
+            fldr = join(uropa_dir, '{PeakTool1}'),
+            json = join(uropa_dir, '{PeakTool1}','{name}.{PeakTool2}.{type}.json'),
+            outroot = join(uropa_dir, '{PeakTool1}','{name}_{PeakTool2}_uropa_{type}'),
             gtf = config['references'][genome]['GTFFILE'],
             threads = 4,
         shell: """
@@ -244,15 +176,15 @@ else:
         input:
             lambda w: [ join(workpath, w.PeakTool1, w.name, w.name + PeakExtensions[w.PeakTool2]) ]
         output:
-            txt=join(workpath, uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.txt'),
-            bed1=temp(join(workpath, uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.bed')),
-            bed2=temp(join(workpath, uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_finalhits.bed')),
+            txt=join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.txt'),
+            bed1=temp(join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.bed')),
+            bed2=temp(join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_finalhits.bed')),
         params:
             rname="uropa",
             uropaver = config['tools']['UROPAVER'],
-            fldr = join(workpath, uropa_dir, '{PeakTool1}'),
-            json = join(workpath, uropa_dir, '{PeakTool1}','{name}.{PeakTool2}.{type}.json'),
-            outroot = join(workpath, uropa_dir, '{PeakTool1}','{name}_{PeakTool2}_uropa_{type}'),
+            fldr = join(uropa_dir, '{PeakTool1}'),
+            json = join(uropa_dir, '{PeakTool1}','{name}.{PeakTool2}.{type}.json'),
+            outroot = join(uropa_dir, '{PeakTool1}','{name}_{PeakTool2}_uropa_{type}'),
             gtf = config['references'][genome]['GTFFILE'],
             threads = 4,
         shell: """
diff --git a/workflow/rules/peakcall.smk b/workflow/rules/peakcall.smk
index 1decc59..466d639 100644
--- a/workflow/rules/peakcall.smk
+++ b/workflow/rules/peakcall.smk
@@ -1,35 +1,26 @@
 
-# Helper functions
-def get_input_bam(wildcards):
-    """
-    Returns a ChIP samples input BAM file,
-    see chip2input for ChIP, Input pairs.
-    """
-    input_sample = chip2input[wildcards.name]
-    if input_sample:
-        # Runs in a ChIP, input mode
-        return join(workpath, bam_dir, "{0}.Q5DD.bam".format(input_sample))
-    else:
-        # Runs in ChIP-only mode
-        return []
+# Peak calling rules
+# ~~~~
+# Peak-calling rules in this file include: sortByRead,
+#   genrich, MACS2_narrow, MACS2_broad
+from os.path import join
+from scripts.peakcall import get_control_input, getMacTXT, getMacChip
+
+
+# ~~ workflow configuration
+workpath                        = config['project']['workpath']
+genome                          = config['options']['genome']
+paired_end                      = False if config['project']['nends'] == 1 else True
+chip2input                      = config['project']['peaks']['inputs']
+
+# Directory end points
+bam_dir                         = join(workpath, "bam")
+ppqt_dir                        = join(bam_dir, "ppqt")
+genrich_dir                     = join(workpath, "Genrich")
+macsN_dir                       = join(workpath, "macsNarrow")
+macsB_dir                       = join(workpath, "macsBroad")
+sicer_dir                       = join(workpath, "sicer")
 
-def get_control_input(wildcards):
-    if paired_end and chip2input[wildcards.name] != "":
-        i = [
-            join(workpath, bam_dir, "{0}.Q5DD.bam".format(chip2input[wildcards.name]))
-        ]
-        return i 
-    elif paired_end and chip2input[wildcards.name] == "":
-        i = []
-        return i 
-    elif not paired_end and chip2input[wildcards.name] != "":
-        i = [
-            join(workpath, bam_dir, "{0}.Q5DD_tagAlign.gz".format(chip2input[wildcards.name]))
-        ]
-        return i
-    else:
-        i = []
-        return i
 
 rule sortByRead:
     """
@@ -41,9 +32,9 @@ rule sortByRead:
         Bam file sorted by read name (extension: sortByRead.bam)
     """
     input:
-        join(workpath,bam_dir,"{name}.sorted.bam")
+        join(bam_dir, "{name}.sorted.bam")
     output:
-        temp(join(workpath,bam_dir,"{name}.sortedByRead.bam"))
+        temp(join(bam_dir, "{name}.sortedByRead.bam"))
     params:
         rname="sortByRead",
         samtools=config['tools']['SAMTOOLSVER'],
@@ -69,9 +60,9 @@ rule genrich:
         summit -log(q-value), and summit position.
     """
     input: 
-        join(workpath,bam_dir,"{name}.sortedByRead.bam")
+        join(bam_dir, "{name}.sortedByRead.bam")
     output: 
-        join(workpath,"Genrich","{name}","{name}.narrowPeak")
+        join(genrich_dir, "{name}", "{name}.narrowPeak")
     params:
         rname="genrich",
         genrich_ver=config['tools']['GENRICHVER']
@@ -92,13 +83,11 @@ rule genrich:
 # INDIVIDUAL RULES
 rule MACS2_narrow:
     input:
-        chip = lambda w: join(workpath,bam_dir, w.name+".Q5DD.bam") \
-        if paired_end else join(workpath,bam_dir, w.name+".Q5DD_tagAlign.gz"),
-        txt = lambda w: join(workpath,bam_dir, ppqt_dir, w.name+".Q5DD.ppqt.txt") \
-        if paired_end else join(workpath,bam_dir, ppqt_dir, w.name+".Q5DD_tagAlign.ppqt.txt"),
-        c_option = get_control_input
+        chip = lambda w: getMacChip(bam_dir, w.name, paired_end),
+        txt = lambda w: getMacTXT(ppqt_dir, w.name, paired_end),
+        c_option = lambda w: get_control_input(chip2input[w.name], paired_end, bam_dir),
     output:
-        join(workpath,macsN_dir,"{name}","{name}_peaks.narrowPeak"),
+        join(macsN_dir, "{name}", "{name}_peaks.narrowPeak"),
     params:
         rname='MACS2_narrow',
         gsize=config['references'][genome]['EFFECTIVEGENOMESIZE'],
@@ -112,7 +101,7 @@ rule MACS2_narrow:
             -t {input.chip} {params.flag} {input.c_option} \\
             -g {params.gsize} \\
             -n {wildcards.name} \\
-            --outdir {workpath}/{macsN_dir}/{wildcards.name} \\
+            --outdir {macsN_dir}/{wildcards.name} \\
             -q 0.01 \\
             --keep-dup="all" \\
             -f "BAMPE"
@@ -122,7 +111,7 @@ rule MACS2_narrow:
             -t {input.chip} {params.flag} {input.c_option} \\
             -g {params.gsize} \\
             -n {wildcards.name} \\
-            --outdir {workpath}/{macsN_dir}/{wildcards.name} \\
+            --outdir {macsN_dir}/{wildcards.name} \\
             -q 0.01 \\
             --keep-dup="all" \\
             --nomodel \\
@@ -132,13 +121,11 @@ rule MACS2_narrow:
 
 rule MACS2_broad:
     input:
-        chip = lambda w: join(workpath,bam_dir, w.name+".Q5DD.bam") \
-        if paired_end else join(workpath,bam_dir, w.name+".Q5DD_tagAlign.gz"),
-        txt = lambda w: join(workpath,bam_dir, ppqt_dir, w.name+".Q5DD.ppqt.txt") \
-        if paired_end else join(workpath,bam_dir, ppqt_dir, w.name+".Q5DD_tagAlign.ppqt.txt"),
-        c_option = get_control_input
+        chip = lambda w: getMacChip(bam_dir, w.name, paired_end),
+        txt = lambda w: getMacTXT(ppqt_dir, w.name, paired_end),
+        c_option = lambda w: get_control_input(chip2input[w.name], paired_end, bam_dir),
     output:
-        join(workpath,macsB_dir,"{name}","{name}_peaks.broadPeak"),
+        join(macsB_dir, "{name}", "{name}_peaks.broadPeak"),
     params:
         rname='MACS2_broad',
         gsize=config['references'][genome]['EFFECTIVEGENOMESIZE'],
@@ -152,7 +139,7 @@ rule MACS2_broad:
             -t {input.chip} {params.flag} {input.c_option} \\
             -g {params.gsize} \\
             -n {wildcards.name} \\
-            --outdir {workpath}/{macsB_dir}/{wildcards.name} \\
+            --outdir {macsB_dir}/{wildcards.name} \\
             --broad \\
             --broad-cutoff 0.01 \\
             --keep-dup="all" \\
@@ -163,7 +150,7 @@ rule MACS2_broad:
             -t {input.chip} {params.flag} {input.c_option} \\
             -g {params.gsize} \\
             -n {wildcards.name} \\
-            --outdir {workpath}/{macsB_dir}/{wildcards.name} \\
+            --outdir {macsB_dir}/{wildcards.name} \\
             --broad \\
             --broad-cutoff 0.01 \\
             --keep-dup="all" \\
@@ -174,20 +161,18 @@ rule MACS2_broad:
 
 rule SICER:
     input: 
-        chip = lambda w: join(workpath,bam_dir, w.name+".Q5DD.bam") \
-        if paired_end else join(workpath,bam_dir, w.name+".Q5DD_tagAlign.gz"),
-        fragLen =lambda w: join(workpath,bam_dir, ppqt_dir, w.name+".Q5DD_tagAlign.ppqt.txt") if \
-            not paired_end else join(workpath,"QC", w.name+".Q5DD.insert_size_metrics.txt"),
-        c_option = get_control_input
+        chip = lambda w: getSicerChips(bam_dir, w.name, paired_end),
+        fragLen = lambda w: getSicerFragLen(ppqt_dir, qc_dir, w.name, paired_end),
+        c_option = lambda w: get_control_input(chip2input[w.name], paired_end, bam_dir),
     output:
-        bed = join(workpath,sicer_dir,"{name}","{name}_broadpeaks.bed"),
+        bed = join(sicer_dir, "{name}", "{name}_broadpeaks.bed"),
     params:
         rname='SICER',
         sicerver=config['tools']['SICERVER'],
         bedtoolsver=config['tools']['BEDTOOLSVER'],
         genomever = config['options']['genome'],
         name="{name}",
-        sicer_dir=join(workpath,sicer_dir,"{name}"),
+        sicer_dir=join(sicer_dir,"{name}"),
         tmpdir=tmpdir,
         paired_end = paired_end,
         frac=config['references'][genome]['FRAC'],
@@ -298,15 +283,15 @@ rule MEME:
     input:
         bed = lambda w: join(workpath, w.PeakTool, w.name, w.name + PeakExtensions[w.PeakTool])
     output:
-        meme_out = join(workpath, "MEME", "{PeakTool}", "{name}_meme", "meme-chip.html"),
-        ame_out = join(workpath, "MEME", "{PeakTool}", "{name}_ame", "ame.html")
+        meme_out = join(MEME_dir, "{PeakTool}", "{name}_meme", "meme-chip.html"),
+        ame_out = join(MEME_dir, "{PeakTool}", "{name}_ame", "ame.html")
     params:
         rname='MEME',
         ref_fa=config['references'][genome]['GENOME'],
         meme_vertebrates_db=config['references'][genome]['MEME_VERTEBRATES_DB'],
         meme_euk_db=config['references'][genome]['MEME_EUKARYOTE_DB'],
         meme_genome_db=config['references'][genome]['MEME_GENOME_DB'],
-        oc=join(workpath, "MEME", "{PeakTool}", "{name}"),
+        oc=join(MEME_dir, "{PeakTool}", "{name}"),
         tmpdir=tmpdir,
         outfa="{name}.fa",
         ntasks=int(28)
diff --git a/workflow/rules/peakcall_qc.smk b/workflow/rules/peakcall_qc.smk
deleted file mode 100644
index 5b41f4b..0000000
--- a/workflow/rules/peakcall_qc.smk
+++ /dev/null
@@ -1,46 +0,0 @@
-rule FRiP:
-    input:
-        bed = lambda w: [ join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) for chip in chips ],
-        bam = join(workpath,bam_dir,"{name}.Q5DD.bam"),
-    output:
-        join(workpath,"PeakQC","{PeakTool}.{name}.Q5DD.FRiP_table.txt"),
-    params:
-        rname="frip",
-        outroot = lambda w: join(workpath,"PeakQC",w.PeakTool),
-        script=join(workpath,"workflow","scripts","frip.py"),
-        genome = config['references'][genome]['REFLEN'],
-        tmpdir = tmpdir,
-    container: config['images']['python']
-    shell: """
-    # Setups temporary directory for
-    # intermediate files with built-in 
-    # mechanism for deletion on exit
-    if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
-    tmp=$(mktemp -d -p "{params.tmpdir}")
-    trap 'rm -rf "${{tmp}}"' EXIT
-
-    python {params.script} \\
-        -p {input.bed} \\
-        -b {input.bam} \\
-        -g {params.genome} \\
-        -o {params.outroot}
-    """
-
-rule jaccard:
-    input:
-        lambda w: [ join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) for chip in chips ],
-    output:
-        join(workpath,qc_dir,'{PeakTool}_jaccard.txt'),
-    params:
-        rname="jaccard",
-        outroot = lambda w: join(workpath,qc_dir,w.PeakTool),
-        script=join(workpath,"workflow","scripts","jaccard_score.py"),
-        genome = config['references'][genome]['REFLEN']
-    envmodules:
-        config['tools']['BEDTOOLSVER']
-    shell: """
-    python {params.script} \\
-        -i "{input}" \\
-        -o "{params.outroot}" \\
-        -g {params.genome}
-    """
diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk
index 7ea370b..894c0f4 100644
--- a/workflow/rules/qc.smk
+++ b/workflow/rules/qc.smk
@@ -1,11 +1,25 @@
-# Common quality-control rules
-# Includes the following:
-#   - preseq
-#   - NRF
-#   - rawfastqc
-#   - fastqc
-#   - fastq_screen
-#   - multiQC
+# Quality control rules
+# ~~~~
+# Common quality-control rules: preseq, NRF, rawfastqc,
+#   fastqc, fastq_screen, multiQC
+from os.path import join
+from scripts.common import allocated, get_bam_ext, get_fqscreen_outputs
+
+
+# ~~ workflow configuration
+workpath                        = config['project']['workpath']
+genome                          = config['options']['genome']
+paired_end                      = config['project']['nends'] != 1  # any nends value other than 1 is treated as paired-end
+samples                         = config['samples']
+ends                            = [1] if not paired_end else [1, 2]
+ 
+# ~~ directories
+qc_dir                          = join(workpath, "QC")
+kraken_dir                      = join(workpath, 'kraken')
+deeptools_dir                   = join(workpath, 'deeptools')
+extra_fingerprint_dir           = join(deeptools_dir, 'sorted_fingerprint')
+
+
 rule preseq:
     """
     Quality step to estimate library complexity. Low library complexity may indicate
@@ -17,19 +31,20 @@ rule preseq:
         Logfile containing library complexity information
     """
     input:
-        bam = join(workpath,bam_dir,"{name}.sorted.bam"),
+        bam                     = join(bam_dir, "{name}.sorted.bam"),
     output:
-        ccurve = join(workpath,qc_dir,"{name}.ccurve"),
+        ccurve                  = join(qc_dir, "{name}.ccurve"),
     params:
-        rname = "preseq",
-        preseqver=config['tools']['PRESEQVER'],
-    shell: """
-    module load {params.preseqver};
-    preseq c_curve \\
-        -B \\
-        -o {output.ccurve} \\
-        {input.bam}            
-    """
+        rname                   = "preseq",
+        preseqver               = config['tools']['PRESEQVER'],
+    shell: 
+        """
+        module load {params.preseqver};
+        preseq c_curve \\
+            -B \\
+            -o {output.ccurve} \\
+            {input.bam}            
+        """
 
 
 rule NRF:
@@ -46,34 +61,35 @@ rule NRF:
         PBC1 = one_pair/distinct_reads, and PBC2 = one_pair/two_pair.
     """
     input:
-        bam=join(workpath,bam_dir,"{name}.sorted.bam"),
+        bam                     = join(bam_dir, "{name}.sorted.bam"),
     output:
-        preseq=join(workpath,qc_dir,"{name}.preseq.dat"),
-        preseqlog=join(workpath,qc_dir,"{name}.preseq.log"),
-        nrf=temp(join(workpath,qc_dir,"{name}.nrf")),
+        preseq                  = join(qc_dir, "{name}.preseq.dat"),
+        preseqlog               = join(qc_dir, "{name}.preseq.log"),
+        nrf                     = temp(join(qc_dir, "{name}.nrf")),
     params:
-        rname='NRF',
-        samtoolsver=config['tools']['SAMTOOLSVER'],
-        rver=config['tools']['RVER'],
-        preseqver=config['tools']['PRESEQVER'],
-        nrfscript=join(workpath,"workflow","scripts","atac_nrf.py "),
+        rname                   = 'NRF',
+        samtoolsver             = config['tools']['SAMTOOLSVER'],
+        rver                    = config['tools']['RVER'],
+        preseqver               = config['tools']['PRESEQVER'],
+        nrfscript               = join(workpath, "workflow", "scripts", "atac_nrf.py"),
     threads: 16
-    shell: """
-    module load {params.preseqver};
-    preseq lc_extrap \\
-        -P \\
-        -B \\
-        -D \\
-        -o {output.preseq} \\
-        {input.bam} \\
-        -seed 12345 \\
-        -v \\
-        -l 100000000000 \\
-    2> {output.preseqlog}
-    python {params.nrfscript} \\
-        {output.preseqlog} \\
-    > {output.nrf}
-    """
+    shell: 
+        """
+        module load {params.preseqver};
+        preseq lc_extrap \\
+            -P \\
+            -B \\
+            -D \\
+            -o {output.preseq} \\
+            {input.bam} \\
+            -seed 12345 \\
+            -v \\
+            -l 100000000000 \\
+        2> {output.preseqlog}
+        python {params.nrfscript} \\
+            {output.preseqlog} \\
+        > {output.nrf}
+        """
 
 
 rule rawfastqc:
@@ -87,44 +103,34 @@ rule rawfastqc:
         FastQC report and zip file containing data quality information
     """
     input:
-        expand(join(workpath,"{name}.R1.fastq.gz"), name=samples) if \
-            not paired_end else \
-            expand(join(workpath,"{name}.R{rn}.fastq.gz"), name=samples,rn=[1,2])
+        expand(join(workpath, "{name}.R{rn}.fastq.gz"), name=samples, rn=ends)
     output:
-        expand(join(workpath,'rawfastQC',"{name}.R1_fastqc.html"),name=samples),
+        expand(join(qc_dir, "rawfastQC", "{name}.R{rn}_fastqc.html"), name=samples, rn=ends),
     params:
-        rname='rawfastqc',
-        outdir=join(workpath,"rawfastQC"),
-        tmpdir=tmpdir,
+        rname                   = 'rawfastqc',
+        outdir                  = join(qc_dir, "rawfastQC"),
+        tmpdir                  = tmpdir,
     envmodules: 
         config['tools']['FASTQCVER']
     threads:
         int(allocated("threads", "rawfastqc", cluster))
-    shell: """
-    # Setups temporary directory for
-    # intermediate files with built-in 
-    # mechanism for deletion on exit
-    if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
-    tmp=$(mktemp -d -p "{params.tmpdir}")
-    trap 'rm -rf "${{tmp}}"' EXIT
+    shell: 
+        """
+        # fastqc storage on lscratch b/c nfs bug
+        if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
+        tmp=$(mktemp -d -p "{params.tmpdir}")
+        trap 'rm -rf "${{tmp}}"' EXIT
 
-    # Running fastqc with local
-    # disk or a tmpdir, fastqc
-    # has been observed to lock
-    # up gpfs filesystems, adding
-    # this on request by HPC staff
-    fastqc \\
-        {input} \\
-        -t {threads} \\
-        -o "${{tmp}}"
-    
-    # Copy output files from tmpdir
-    # to output directory
-    find "${{tmp}}" \\
-        -type f \\
-        \\( -name '*.html' -o -name '*.zip' \\) \\
-        -exec cp {{}} {params.outdir} \\; 
-    """
+        fastqc \\
+            {input} \\
+            -t {threads} \\
+            -o "${{tmp}}"
+            
+        find "${{tmp}}" \\
+            -type f \\
+            \\( -name '*.html' -o -name '*.zip' \\) \\
+            -exec cp {{}} {params.outdir} \\; 
+        """
 
 
 rule fastqc:
@@ -138,15 +144,13 @@ rule fastqc:
         Trimmed FastQC reports and zip file containing data quality information
     """
     input:
-        expand(join(workpath,trim_dir,"{name}.R1.trim.fastq.gz"),name=samples) if \
-            not paired_end else \
-            expand(join(workpath,trim_dir,"{name}.R{rn}.trim.fastq.gz"), name=samples,rn=[1,2])
+        expand(join(trim_dir, "{name}.R{rn}.trim.fastq.gz"), name=samples, rn=ends)
     output:
-        expand(join(workpath,'fastQC',"{name}.R1.trim_fastqc.html"),name=samples),
+        expand(join(qc_dir, 'fastQC', "{name}.R{rn}.trim_fastqc.html"), name=samples, rn=ends),
     params:
-        rname='fastqc',
-        outdir=join(workpath,"fastQC"),
-        tmpdir=tmpdir,
+        rname                   = 'fastqc',
+        outdir                  = join(qc_dir, "fastQC"),
+        tmpdir                  = tmpdir,
     envmodules: 
         config['tools']['FASTQCVER']
     threads:
@@ -189,51 +193,42 @@ rule fastq_screen:
         FastQ Screen report and logfiles
     """
     input:
-        join(workpath,trim_dir,"{name}.R1.trim.fastq.gz") if not paired_end else \
-            expand(join(workpath,trim_dir,"{name}.R{rn}.trim.fastq.gz"),name=samples,rn=[1,2])
+        expand(join(trim_dir, "{name}.R{rn}.trim.fastq.gz"), name=samples, rn=ends)
     output:
-        join(workpath,"FQscreen","{name}.R1.trim_screen.txt") if not paired_end else \
-            expand(join(workpath,"FQscreen","{name}.R{rn}.trim_screen.txt"),name=samples,rn=[1,2]),
-        join(workpath,"FQscreen","{name}.R1.trim_screen.png") if not paired_end else \
-            expand(join(workpath,"FQscreen","{name}.R{rn}.trim_screen.png"),name=samples,rn=[1,2]),
-        join(workpath,"FQscreen2","{name}.R1.trim_screen.txt") if not paired_end else \
-            expand(join(workpath,"FQscreen2","{name}.R{rn}.trim_screen.txt"),name=samples,rn=[1,2]),
-        join(workpath,"FQscreen2","{name}.R1.trim_screen.png") if not paired_end else \
-            expand(join(workpath,"FQscreen2","{name}.R{rn}.trim_screen.png"),name=samples,rn=[1,2]),
+        get_fqscreen_outputs(paired_end, samples, qc_dir)
     params:
-        rname   = 'fqscreen',
-        outdir  = join(workpath,"FQscreen"),
-        outdir2 = join(workpath,"FQscreen2"),
-        # Exposed Parameters: modify resources/fastq_screen{_2}.conf 
-        # to change defaults locations to bowtie2 indices
-        fastq_screen         = config['bin']['FASTQ_SCREEN'],
-        fastq_screen_config1 = config['shared_resources']['FASTQ_SCREEN_CONFIG_P1'],
-        fastq_screen_config2 = config['shared_resources']['FASTQ_SCREEN_CONFIG_P2'],
+        rname                   = 'fqscreen',
+        outdir                  = join(qc_dir, "FQscreen"),
+        outdir2                 = join(qc_dir, "FQscreen2"),
+        fastq_screen            = config['bin']['FASTQ_SCREEN'],
+        fastq_screen_config1    = config['shared_resources']['FASTQ_SCREEN_CONFIG_P1'],
+        fastq_screen_config2    = config['shared_resources']['FASTQ_SCREEN_CONFIG_P2'],
     envmodules:
         config['tools']['BOWTIE2VER'],
         config['tools']['PERLVER'],
     threads: 
         int(allocated("threads", "fastq_screen", cluster))
-    shell: """
-    # First pass of contamination screening
-    {params.fastq_screen} \\
-        --conf {params.fastq_screen_config1} \\
-        --outdir {params.outdir} \\
-        --threads {threads} \\
-        --subset 1000000 \\
-        --aligner bowtie2 \\
-        --force \\
-        {input}
-    # Second pass of contamination screening
-    {params.fastq_screen} \\
-        --conf {params.fastq_screen_config2} \\
-        --outdir {params.outdir2} \\
-        --threads {threads} \\
-        --subset 1000000 \\
-        --aligner bowtie2 \\
-        --force \\
-        {input}
-    """
+    shell: 
+        """
+        # First pass of contamination screening
+        {params.fastq_screen} \\
+            --conf {params.fastq_screen_config1} \\
+            --outdir {params.outdir} \\
+            --threads {threads} \\
+            --subset 1000000 \\
+            --aligner bowtie2 \\
+            --force \\
+            {input}
+        # Second pass of contamination screening
+        {params.fastq_screen} \\
+            --conf {params.fastq_screen_config2} \\
+            --outdir {params.outdir2} \\
+            --threads {threads} \\
+            --subset 1000000 \\
+            --aligner bowtie2 \\
+            --force \\
+            {input}
+        """
 
 rule kraken:
     """
@@ -247,52 +242,54 @@ rule kraken:
         Kraken logfile and interative krona report
     """
     input:
-        fq1=join(workpath,trim_dir,"{name}.R1.trim.fastq.gz"),
-        fq2=provided(join(workpath,trim_dir,"{name}.R2.trim.fastq.gz"), paired_end)
+        fq1                     = join(trim_dir, "{name}.R1.trim.fastq.gz"),
+        fq2                     = provided(join(trim_dir,"{name}.R2.trim.fastq.gz"), paired_end)
     output:
-        krakenout = join(workpath,kraken_dir,"{name}.trim.kraken_bacteria.out.txt"),
-        krakentaxa = join(workpath,kraken_dir,"{name}.trim.kraken_bacteria.taxa.txt"),
-        kronahtml = join(workpath,kraken_dir,"{name}.trim.kraken_bacteria.krona.html"),
+        krakenout               = join(kraken_dir, "{name}.trim.kraken_bacteria.out.txt"),
+        krakentaxa              = join(kraken_dir, "{name}.trim.kraken_bacteria.taxa.txt"),
+        kronahtml               = join(kraken_dir, "{name}.trim.kraken_bacteria.krona.html"),
     params:
-        rname='kraken',
-        outdir=join(workpath,kraken_dir),
-        bacdb=config['shared_resources']['KRAKENBACDB'],
-        tmpdir=tmpdir,
-        paired_end = paired_end
-    threads: int(allocated("threads", "kraken_pe", cluster)),
+        rname                   = 'kraken',
+        outdir                  = kraken_dir,
+        bacdb                   = config['shared_resources']['KRAKENBACDB'],
+        tmpdir                  = tmpdir,
+        paired_end              = paired_end
+    threads: 
+        int(allocated("threads", "kraken_pe", cluster)),
     envmodules:
         config['tools']['KRAKENVER'],
         config['tools']['KRONATOOLSVER'],
-    shell: """
-    # Setups temporary directory for
-    # intermediate files with built-in 
-    # mechanism for deletion on exit
-    if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
-    tmp=$(mktemp -d -p "{params.tmpdir}")
-    trap 'rm -rf "${{tmp}}"' EXIT
-    
-    # Copy kraken2 db to /lscratch or temp 
-    # location to reduce filesystem strain
-    cp -rv {params.bacdb} ${{tmp}}/;
-    kdb_base=$(basename {params.bacdb})
-    if [ '{params.paired_end}' == True ]; then
-        kraken2 --db ${{tmp}}/${{kdb_base}} \\
-            --threads {threads} --report {output.krakentaxa} \\
-            --output {output.krakenout} \\
-            --gzip-compressed \\
-            --paired {input.fq1} {input.fq2}
-    else
-        kraken2 --db ${{tmp}}/${{kdb_base}} \\
-            --threads {threads} --report {output.krakentaxa} \\
-            --output {output.krakenout} \\
-            --gzip-compressed \\
-            {input.fq1}
-    fi
-    
-    # Generate Krona Report
-    cut -f2,3 {output.krakenout} | \\
-        ktImportTaxonomy - -o {output.kronahtml}
-    """
+    shell: 
+        """
+        # Setups temporary directory for
+        # intermediate files with built-in 
+        # mechanism for deletion on exit
+        if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
+        tmp=$(mktemp -d -p "{params.tmpdir}")
+        trap 'rm -rf "${{tmp}}"' EXIT
+        
+        # Copy kraken2 db to /lscratch or temp 
+        # location to reduce filesystem strain
+        cp -rv {params.bacdb} ${{tmp}}/;
+        kdb_base=$(basename {params.bacdb})
+        if [ '{params.paired_end}' == True ]; then
+            kraken2 --db ${{tmp}}/${{kdb_base}} \\
+                --threads {threads} --report {output.krakentaxa} \\
+                --output {output.krakenout} \\
+                --gzip-compressed \\
+                --paired {input.fq1} {input.fq2}
+        else
+            kraken2 --db ${{tmp}}/${{kdb_base}} \\
+                --threads {threads} --report {output.krakentaxa} \\
+                --output {output.krakenout} \\
+                --gzip-compressed \\
+                {input.fq1}
+        fi
+        
+        # Generate Krona Report
+        cut -f2,3 {output.krakenout} | \\
+            ktImportTaxonomy - -o {output.kronahtml}
+        """
 
 rule multiqc:
     """
@@ -306,25 +303,22 @@ rule multiqc:
         Interactive MulitQC report and a QC metadata table
     """
     input: 
-        expand(join(workpath,"FQscreen","{name}.R1.trim_screen.txt"),name=samples),
-        expand(join(workpath,"FQscreen2","{name}.R1.trim_screen.txt"),name=samples),
-        expand(join(workpath,kraken_dir,"{name}.trim.kraken_bacteria.krona.html"),name=samples),
-        expand(join(workpath,qc_dir,"{name}.ccurve"), name=samples),
-        expand(join(workpath,bam_dir,"{name}.Q5DD.bam.flagstat"), name=samples),
-        expand(join(workpath,bam_dir,"{name}.Q5.bam.flagstat"), name=samples),
-        # join(workpath,qc_dir,"QCTable.txt"),
-        expand(join(workpath,"rawfastQC","{name}.R1_fastqc.html"),name=samples),
-        expand(join(workpath,"fastQC","{name}.R1.trim_fastqc.html"),name=samples),
-        # expand(join(workpath,deeptools_dir,"{group}.fingerprint.raw.Q5DD.tab"),group=groups),
-        join(workpath,deeptools_dir,"spearman_heatmap.Q5DD_mqc.png")
+        expand(join(qc_dir, "FQscreen", "{name}.R1.trim_screen.txt"), name=samples),
+        expand(join(qc_dir, "FQscreen2", "{name}.R1.trim_screen.txt"), name=samples),
+        expand(join(qc_dir, "{name}.ccurve"), name=samples),
+        expand(join(qc_dir, "rawfastQC", "{name}.R1_fastqc.html"), name=samples),
+        expand(join(qc_dir, "fastQC", "{name}.R1.trim_fastqc.html"), name=samples),
+        expand(join(kraken_dir, "{name}.trim.kraken_bacteria.krona.html"), name=samples),
+        expand(join(bam_dir, "{name}.Q5DD.bam.flagstat"), name=samples),
+        expand(join(bam_dir, "{name}.Q5.bam.flagstat"), name=samples),
+        join(deeptools_dir, "spearman_heatmap.Q5DD_mqc.png")
     output:
-        join(workpath,"multiqc_report.html")
+        join(workpath, "multiqc_report.html")
     params:
-        rname="multiqc",
-        multiqc=config['tools']['MULTIQCVER'],
-	    qcconfig=join(workpath, config['shared_resources']['MULTIQC_CONFIG']),
-	    excludedir=join(workpath,extra_fingerprint_dir),
-        dir=workpath
+        rname                   = "multiqc",
+        multiqc                 = config['tools']['MULTIQCVER'],
+        qcconfig                = join(workpath, config['shared_resources']['MULTIQC_CONFIG']),
+        excludedir              = extra_fingerprint_dir,  # already rooted at workpath via deeptools_dir
     shell: """
     module load {params.multiqc}
     multiqc \\
@@ -333,9 +327,10 @@ rule multiqc:
         --interactive \\
         -e cutadapt \\
         --ignore {params.excludedir} \\
-        -d {params.dir}
+        -d {workpath}
     """
 
+
 rule insert_size:
     """
     Quality step calculates number of reads per insert size.
@@ -347,31 +342,32 @@ rule insert_size:
         Number of reads per insert size and their histogram
     """
     input:
-        bam = lambda w : join(workpath,bam_dir,w.name + "." + w.ext + "." + extensionsDict[w.ext])
+        bam                     = lambda w : join(bam_dir, w.name + "." + w.ext + "." + get_bam_ext(w.ext, paired_end))
     output:
-        txt= join(workpath,qc_dir,"{name}.{ext}.insert_size_metrics.txt"),
-        pdf= join(workpath,qc_dir,"{name}.{ext}.insert_size_histogram.pdf"),
+        txt                     = join(qc_dir, "{name}.{ext}.insert_size_metrics.txt"),
+        pdf                     = join(qc_dir, "{name}.{ext}.insert_size_histogram.pdf"),
     params:
-        rname="insert_size",
-        picardver=config['tools']['PICARDVER'],
-        rver=config['tools']['RVER'],
-        javaram='16g',
-    shell: """
-    module load {params.picardver} {params.rver};
-    java -Xmx{params.javaram} -jar ${{PICARDJARPATH}}/picard.jar CollectInsertSizeMetrics \\
-        -I {input.bam} \\
-        -O {output.txt} \\
-        -H {output.pdf}
-    """
+        rname                   = "insert_size",
+        picardver               = config['tools']['PICARDVER'],
+        rver                    = config['tools']['RVER'],
+        javaram                 = '16g',
+    shell: 
+        """
+        module load {params.picardver} {params.rver};
+        java -Xmx{params.javaram} -jar ${{PICARDJARPATH}}/picard.jar CollectInsertSizeMetrics \\
+            -I {input.bam} \\
+            -O {output.txt} \\
+            -H {output.pdf}
+        """
 
 rule deeptools_QC:
     input:
         [ join(workpath, bw_dir, name + ".Q5DD.RPGC.bw") for name in samples ] # this should be all bigwigs
     output:
-        heatmap=join(workpath,deeptools_dir,"spearman_heatmap.Q5DD.pdf"),
-        pca=join(workpath,deeptools_dir,"pca.Q5DD.pdf"),
-	    npz=temp(join(workpath,deeptools_dir,"Q5DD.npz")),
-	    png=join(workpath,deeptools_dir,"spearman_heatmap.Q5DD_mqc.png")
+        heatmap=join(deeptools_dir, "spearman_heatmap.Q5DD.pdf"),
+        pca=join(deeptools_dir, "pca.Q5DD.pdf"),
+	    npz=temp(join(deeptools_dir, "Q5DD.npz")),
+	    png=join(deeptools_dir, "spearman_heatmap.Q5DD_mqc.png")
     params:
         rname="deeptools_QC",
         deeptoolsver=config['tools']['DEEPTOOLSVER'],
@@ -384,3 +380,49 @@ rule deeptools_QC:
     plotPCA -in {output.npz} -o {output.pca}
     """
 
+rule FRiP:
+    input:
+        bed = lambda w: [ join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) for chip in chips ],
+        bam = join(bam_dir, "{name}.Q5DD.bam"),
+    output:
+        join(workpath,"PeakQC","{PeakTool}.{name}.Q5DD.FRiP_table.txt"),
+    params:
+        rname="frip",
+        outroot = lambda w: join(workpath,"PeakQC",w.PeakTool),
+        script=join(workpath,"workflow","scripts","frip.py"),
+        genome = config['references'][genome]['REFLEN'],
+        tmpdir = tmpdir,
+    container: config['images']['python']
+    shell: """
+    # Setups temporary directory for
+    # intermediate files with built-in 
+    # mechanism for deletion on exit
+    if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
+    tmp=$(mktemp -d -p "{params.tmpdir}")
+    trap 'rm -rf "${{tmp}}"' EXIT
+
+    python {params.script} \\
+        -p {input.bed} \\
+        -b {input.bam} \\
+        -g {params.genome} \\
+        -o {params.outroot}
+    """
+
+rule jaccard:
+    input:
+        lambda w: [ join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) for chip in chips ],
+    output:
+        join(qc_dir, '{PeakTool}_jaccard.txt'),
+    params:
+        rname="jaccard",
+        outroot = lambda w: join(qc_dir, w.PeakTool),
+        script=join(workpath,"workflow","scripts","jaccard_score.py"),
+        genome = config['references'][genome]['REFLEN']
+    envmodules:
+        config['tools']['BEDTOOLSVER']
+    shell: """
+    python {params.script} \\
+        -i "{input}" \\
+        -o "{params.outroot}" \\
+        -g {params.genome}
+    """
diff --git a/workflow/rules/trim_align_dedup.smk b/workflow/rules/trim_align_dedup.smk
index 078701b..0f1b458 100644
--- a/workflow/rules/trim_align_dedup.smk
+++ b/workflow/rules/trim_align_dedup.smk
@@ -1,63 +1,28 @@
-# Common paired-end rules:
-#   - trim_pe
-#   - kraken_pe
-#   - BWA_PE
-#   - insert_size
-
-def dedup_out7(input, assay, paired_end):
-    if assay == "cfchip":
-        i = [
-            
-            input+".Q5DD_tagAlign"
-        ]
-        return i 
-    elif paired_end == False and assay == "chip":
-        i = [
-            input+".Q5DD_tagAlign.gz"
-        ]
-        return i
-    else:
-        i = []
-        return i
+# Trimming, alignment, and redundancy reduction rules
+# ~~~~
+# Common paired-end rules: trim_pe, kraken_pe, BWA_PE, insert_size
+import snakemake
+from os.path import join
+from scripts.common import allocated, get_bam_ext
+from scripts.grouping import dedup_out7, get_bam_input, get_ppqt_input
+from scripts.blocking import ctrl_test
 
+# ~~ workflow configuration
+workpath                        = config['project']['workpath']
+genome                          = config['options']['genome']
+paired_end                      = config['project']['nends'] != 1  # any nends value other than 1 is treated as paired-end
+ends                            = [1] if not paired_end else [1, 2]
+chip2input                      = config['project']['peaks']['inputs']
 
-def get_ppqt_input(wildcards):
-    if paired_end:
-        i = [
-            join(workpath, bam_dir, ppqt_dir, "{0}.{1}.ppqt.txt".format(wildcards.name, wildcards.ext))
-        ]
-        return i 
-    else:
-        if wildcards.ext == "Q5DD":
-            i = [
-                join(workpath, bam_dir, ppqt_dir, "{0}.Q5DD_tagAlign.ppqt.txt".format(wildcards.name))
-            ]
-            return i 
-        elif wildcards.ext == "sorted":
-            i = [
-                join(workpath, bam_dir, ppqt_dir, "{0}.sorted.ppqt.txt".format(wildcards.name))
-            ]
-            return i
+# ~~ directories
+trim_dir                        = join(workpath, 'trim')
+tmpdir                          = config['options']['tmp_dir']
+bam_dir                         = join(workpath, "bam")
+bw_dir                          = join(workpath, "bigwig")
+qc_dir                          = join(workpath, "QC")
+ppqt_dir                        = join(bam_dir, "ppqt")
 
 
-def get_bam_input(wildcards):
-    if paired_end:
-        i = [
-            join(workpath, bam_dir, "{0}.{1}.bam".format(wildcards.name, wildcards.ext))
-        ]
-        return i 
-    else:
-        if wildcards.ext == "Q5DD":
-            i = [
-                join(workpath, bam_dir, "{0}.Q5DD.bam".format(wildcards.name))
-            ]
-            return i
-        elif wildcards.ext == "sorted":
-            i = [
-                join(workpath, bam_dir, "{0}.sorted.bam".format(wildcards.name))
-            ]
-            return i
-        
 rule trim:
     """
     Data-processing step to remove adapter sequences and perform quality trimming
@@ -76,119 +41,119 @@ rule trim:
         Trimmed and blacklist-sequences-free FastQ files
     """
     input:
-        file1=join(workpath,"{name}.R1.fastq.gz"),
-        file2=provided(join(workpath,"{name}.R2.fastq.gz"),paired_end)
+        file1                               = join(workpath, "{name}.R1.fastq.gz"),
+        file2                               = provided(join(workpath,"{name}.R2.fastq.gz"), paired_end)
     output:
-        outfq1=temp(join(workpath,trim_dir,"{name}.R1.trim.fastq.gz")),
-        outfq2=provided(temp(join(workpath,trim_dir,"{name}.R2.trim.fastq.gz")),paired_end)
+        outfq1                              = temp(join(trim_dir, "{name}.R1.trim.fastq.gz")),
+        outfq2                              = provided(temp(join(trim_dir, "{name}.R2.trim.fastq.gz")), paired_end)
     params:
-        rname="trim",
-        cutadaptver=config['tools']['CUTADAPTVER'],
-        workpath=config['project']['workpath'],
-        fastawithadaptersetd=join(workpath, config['shared_resources']['ADAPTERS_FASTA']),
-        blacklistbwaindex=config['references'][genome]['BLACKLISTBWAINDEX'],
-        picardver=config['tools']['PICARDVER'],
-        bwaver=config['tools']['BWAVER'],
-        samtoolsver=config['tools']['SAMTOOLSVER'],
-        minlen=35,
-        leadingquality=10,
-        trailingquality=10,
-        javaram="64g",
-        sample="{name}",
-        tmpdir=tmpdir,
-        paired_end = paired_end
-    threads: 16
-    shell: """
-    module load {params.cutadaptver};
-    module load {params.bwaver};
-    module load {params.samtoolsver};
-    module load {params.picardver};
-    # Setups temporary directory for
-    # intermediate files with built-in 
-    # mechanism for deletion on exit
-    if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
-    tmp=$(mktemp -d -p "{params.tmpdir}")
-    trap 'rm -rf "${{tmp}}"' EXIT
+        rname                               = "trim",
+        cutadaptver                         = config['tools']['CUTADAPTVER'],
+        workpath                            = config['project']['workpath'],
+        fastawithadaptersetd                = join(workpath, config['shared_resources']['ADAPTERS_FASTA']),
+        blacklistbwaindex                   = config['references'][genome]['BLACKLISTBWAINDEX'],
+        picardver                           = config['tools']['PICARDVER'],
+        bwaver                              = config['tools']['BWAVER'],
+        samtoolsver                         = config['tools']['SAMTOOLSVER'],
+        minlen                              = 35,
+        leadingquality                      = 10,
+        trailingquality                     = 10,
+        javaram                             = "64g",
+        sample                              = "{name}",
+        tmpdir                              = tmpdir,
+        paired_end                          = paired_end
+    threads: 
+        16
+    shell: 
+        """
+        module load {params.cutadaptver};
+        module load {params.bwaver};
+        module load {params.samtoolsver};
+        module load {params.picardver};
+        if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
+        tmp=$(mktemp -d -p "{params.tmpdir}")
+        trap 'rm -rf "${{tmp}}"' EXIT
 
-    if [ '{params.paired_end}' == True ];then
-        cutadapt \\
-            --pair-filter=any \\
-            --nextseq-trim=2 \\
-            --trim-n \\
-            -n 5 \\
-            -O 5 \\
-            -q {params.leadingquality},{params.trailingquality} \\
-            -m {params.minlen}:{params.minlen} \\
-            -b file:{params.fastawithadaptersetd} \\
-            -B file:{params.fastawithadaptersetd} \\
-            -j {threads} \\
-            -o ${{tmp}}/{params.sample}.R1.trim.fastq.gz \\
-            -p ${{tmp}}/{params.sample}.R2.trim.fastq.gz \\
-            {input.file1} {input.file2}
-        
-        if [ "{params.blacklistbwaindex}" != "" ];
-        then bwa mem -t {threads} \\
-            {params.blacklistbwaindex} \\
-            ${{tmp}}/{params.sample}.R1.trim.fastq.gz \\
-            ${{tmp}}/{params.sample}.R2.trim.fastq.gz \\
-            | samtools view -@{threads} \\
-                -f4 \\
-                -b \\
-                -o ${{tmp}}/{params.sample}.bam;
-        rm ${{tmp}}/{params.sample}.R1.trim.fastq.gz;
-        rm ${{tmp}}/{params.sample}.R2.trim.fastq.gz;
-        
-        java -Xmx{params.javaram} -jar $PICARDJARPATH/picard.jar SamToFastq \\
-            -VALIDATION_STRINGENCY SILENT \\
-            -INPUT ${{tmp}}/{params.sample}.bam \\
-            -FASTQ ${{tmp}}/{params.sample}.R1.trim.fastq \\
-            -SECOND_END_FASTQ ${{tmp}}/{params.sample}.R2.trim.fastq \\
-            -UNPAIRED_FASTQ ${{tmp}}/{params.sample}.unpaired.noBL.fastq
+        if [ '{params.paired_end}' == True ]; then
+            cutadapt \\
+                --pair-filter=any \\
+                --nextseq-trim=2 \\
+                --trim-n \\
+                -n 5 \\
+                -O 5 \\
+                -q {params.leadingquality},{params.trailingquality} \\
+                -m {params.minlen}:{params.minlen} \\
+                -b file:{params.fastawithadaptersetd} \\
+                -B file:{params.fastawithadaptersetd} \\
+                -j {threads} \\
+                -o ${{tmp}}/{params.sample}.R1.trim.fastq.gz \\
+                -p ${{tmp}}/{params.sample}.R2.trim.fastq.gz \\
+                {input.file1} {input.file2}
             
-        rm ${{tmp}}/{params.sample}.bam;
-        
-        pigz -p {threads} ${{tmp}}/{params.sample}.R1.trim.fastq;
-        pigz -p {threads} ${{tmp}}/{params.sample}.R2.trim.fastq;
-        fi
-        mv ${{tmp}}/{params.sample}.R1.trim.fastq.gz {output.outfq1};
-        mv ${{tmp}}/{params.sample}.R2.trim.fastq.gz {output.outfq2};
-    else
-        cutadapt \\
-            --nextseq-trim=2 \\
-            --trim-n \\
-            -n 5 \\
-            -O 5 \\
-            -q {params.leadingquality},{params.trailingquality} \\
-            -m {params.minlen} \\
-            -b file:{params.fastawithadaptersetd} \\
-            -j {threads} \\
-            -o ${{tmp}}/{params.sample}.R1.trim.fastq.gz \\
-            {input.file1}
-        
-        if [ "{params.blacklistbwaindex}" != "" ];
-        then bwa mem -t {threads} \\
-            {params.blacklistbwaindex} \\
-            ${{tmp}}/{params.sample}.R1.trim.fastq.gz \\
-            | samtools view -@{threads} \\
-                -f4 \\
-                -b \\
-                -o ${{tmp}}/{params.sample}.bam;
-        rm ${{tmp}}/{params.sample}.R1.trim.fastq.gz;
-        
-        java -Xmx{params.javaram} -jar $PICARDJARPATH/picard.jar SamToFastq \\
-            -VALIDATION_STRINGENCY SILENT \\
-            -INPUT ${{tmp}}/{params.sample}.bam \\
-            -FASTQ ${{tmp}}/{params.sample}.R1.trim.fastq
+            if [ "{params.blacklistbwaindex}" != "" ];
+            then bwa mem -t {threads} \\
+                {params.blacklistbwaindex} \\
+                ${{tmp}}/{params.sample}.R1.trim.fastq.gz \\
+                ${{tmp}}/{params.sample}.R2.trim.fastq.gz \\
+                | samtools view -@{threads} \\
+                    -f4 \\
+                    -b \\
+                    -o ${{tmp}}/{params.sample}.bam;
+            rm ${{tmp}}/{params.sample}.R1.trim.fastq.gz;
+            rm ${{tmp}}/{params.sample}.R2.trim.fastq.gz;
             
-        rm ${{tmp}}/{params.sample}.bam;
-        
-        pigz -p {threads} ${{tmp}}/{params.sample}.R1.trim.fastq;
+            java -Xmx{params.javaram} -jar $PICARDJARPATH/picard.jar SamToFastq \\
+                -VALIDATION_STRINGENCY SILENT \\
+                -INPUT ${{tmp}}/{params.sample}.bam \\
+                -FASTQ ${{tmp}}/{params.sample}.R1.trim.fastq \\
+                -SECOND_END_FASTQ ${{tmp}}/{params.sample}.R2.trim.fastq \\
+                -UNPAIRED_FASTQ ${{tmp}}/{params.sample}.unpaired.noBL.fastq
+                
+            rm ${{tmp}}/{params.sample}.bam;
+            
+            pigz -p {threads} ${{tmp}}/{params.sample}.R1.trim.fastq;
+            pigz -p {threads} ${{tmp}}/{params.sample}.R2.trim.fastq;
+            fi
+            mv ${{tmp}}/{params.sample}.R1.trim.fastq.gz {output.outfq1};
+            mv ${{tmp}}/{params.sample}.R2.trim.fastq.gz {output.outfq2};
+        else
+            cutadapt \\
+                --nextseq-trim=2 \\
+                --trim-n \\
+                -n 5 \\
+                -O 5 \\
+                -q {params.leadingquality},{params.trailingquality} \\
+                -m {params.minlen} \\
+                -b file:{params.fastawithadaptersetd} \\
+                -j {threads} \\
+                -o ${{tmp}}/{params.sample}.R1.trim.fastq.gz \\
+                {input.file1}
+            
+            if [ "{params.blacklistbwaindex}" != "" ];
+            then bwa mem -t {threads} \\
+                {params.blacklistbwaindex} \\
+                ${{tmp}}/{params.sample}.R1.trim.fastq.gz \\
+                | samtools view -@{threads} \\
+                    -f4 \\
+                    -b \\
+                    -o ${{tmp}}/{params.sample}.bam;
+            rm ${{tmp}}/{params.sample}.R1.trim.fastq.gz;
+            
+            java -Xmx{params.javaram} -jar $PICARDJARPATH/picard.jar SamToFastq \\
+                -VALIDATION_STRINGENCY SILENT \\
+                -INPUT ${{tmp}}/{params.sample}.bam \\
+                -FASTQ ${{tmp}}/{params.sample}.R1.trim.fastq
+                
+            rm ${{tmp}}/{params.sample}.bam;
+            
+            pigz -p {threads} ${{tmp}}/{params.sample}.R1.trim.fastq;
 
+            fi
+            mv ${{tmp}}/{params.sample}.R1.trim.fastq.gz {output.outfq1};
         fi
-        mv ${{tmp}}/{params.sample}.R1.trim.fastq.gz {output.outfq1};
-    fi
         """
 
+
 rule BWA:
     """
     Data processing rule to align trimmed and blacklist-sequences-free reads 
@@ -205,56 +170,54 @@ rule BWA:
         Bam file that has reads aligned and filted by mapQ a value: Q5.bam
     """
     input:
-        infq1 = join(workpath,trim_dir,"{name}.R1.trim.fastq.gz"),
-        infq2 = provided(join(workpath,trim_dir,"{name}.R2.trim.fastq.gz"), paired_end)
+        infq1                               = join(trim_dir, "{name}.R1.trim.fastq.gz"),
+        infq2                               = provided(join(trim_dir, "{name}.R2.trim.fastq.gz"), paired_end),  # rule trim only declares the R2 output for paired-end runs, so only require it then
     params:
-        d=join(workpath,bam_dir),
-        rname='bwa',
-        reference=config['references'][genome]['BWA'],
-        bwaver=config['tools']['BWAVER'],
-        samtoolsver=config['tools']['SAMTOOLSVER'],
-        script=join(workpath,"workflow","scripts","bam_filter_by_mapq.py"),
-        pythonver=config['tools']['PYTHONVER'],
-        paired_end = paired_end
+        d                                   = join(bam_dir),
+        rname                               = 'bwa',
+        reference                           = config['references'][genome]['BWA'],
+        bwaver                              = config['tools']['BWAVER'],
+        samtoolsver                         = config['tools']['SAMTOOLSVER'],
+        script                              = join(workpath, "bin", "bam_filter_by_mapq.py"),
+        pythonver                           = config['tools']['PYTHONVER'],
     output:
-        outbam1=join(workpath,bam_dir,"{name}.sorted.bam"), 
-        outbam2=temp(join(join(workpath,bam_dir,"{name}.Q5.bam"))),
-        flagstat1=join(workpath,bam_dir,"{name}.sorted.bam.flagstat"),
-        idxstat1=join(workpath,bam_dir,"{name}.sorted.bam.idxstat"),
-        flagstat2=join(workpath,bam_dir,"{name}.Q5.bam.flagstat"),
-        idxstat2=join(workpath,bam_dir,"{name}.Q5.bam.idxstat"),
+        outbam1                             = join(bam_dir, "{name}.sorted.bam"), 
+        outbam2                             = temp(join(bam_dir, "{name}.Q5.bam")),
+        flagstat1                           = join(bam_dir, "{name}.sorted.bam.flagstat"),
+        idxstat1                            = join(bam_dir, "{name}.sorted.bam.idxstat"),
+        flagstat2                           = join(bam_dir, "{name}.Q5.bam.flagstat"),
+        idxstat2                            = join(bam_dir, "{name}.Q5.bam.idxstat"),
     threads: 32
-    shell: """
-    module load {params.bwaver};
-    module load {params.samtoolsver};
-    module load {params.pythonver};
-    if [ '{params.paired_end}' == True ];then
-        bwa mem -t {threads} {params.reference} {input.infq1} {input.infq2} \\
-            | samtools sort -@{threads} -o {output.outbam1}
-        
-        samtools index {output.outbam1}
-        samtools flagstat {output.outbam1} > {output.flagstat1}
-        samtools idxstats {output.outbam1} > {output.idxstat1}
-        #samtools view -b -q 6 {output.outbam1} -o {output.outbam2}
-        
-        python {params.script} -i {output.outbam1} -o {output.outbam2} -q 6
-        samtools index {output.outbam2}
-        samtools flagstat {output.outbam2} > {output.flagstat2}
-        samtools idxstats {output.outbam2} > {output.idxstat2}
-    else
-        bwa mem -t {threads} {params.reference} {input.infq1} \\
-            | samtools sort -@{threads} -o {output.outbam1}
-        
-        samtools index {output.outbam1}
-        samtools flagstat {output.outbam1} > {output.flagstat1}
-        samtools idxstats {output.outbam1} > {output.idxstat1}
-        samtools view -b -q 6 {output.outbam1} -o {output.outbam2}
-        
-        samtools index {output.outbam2}
-        samtools flagstat {output.outbam2} > {output.flagstat2}
-        samtools idxstats {output.outbam2} > {output.idxstat2}
-    fi
-    """
+    shell: 
+        """
+        module load {params.bwaver};
+        module load {params.samtoolsver};
+        module load {params.pythonver};
+        if [ '""" + str(paired_end) + """' == True ];then
+            bwa mem -t {threads} {params.reference} {input.infq1} {input.infq2} \\
+                | samtools sort -@{threads} -o {output.outbam1}
+            
+            samtools index {output.outbam1}
+            samtools flagstat {output.outbam1} > {output.flagstat1}
+            samtools idxstats {output.outbam1} > {output.idxstat1}
+            
+            python {params.script} -i {output.outbam1} -o {output.outbam2} -q 6
+            samtools index {output.outbam2}
+            samtools flagstat {output.outbam2} > {output.flagstat2}
+            samtools idxstats {output.outbam2} > {output.idxstat2}
+        else
+            bwa mem -t {threads} {params.reference} {input.infq1} \\
+                | samtools sort -@{threads} -o {output.outbam1}
+            samtools index {output.outbam1}
+            samtools flagstat {output.outbam1} > {output.flagstat1}
+            samtools idxstats {output.outbam1} > {output.idxstat1}
+            samtools view -b -q 6 {output.outbam1} -o {output.outbam2}
+            
+            samtools index {output.outbam2}
+            samtools flagstat {output.outbam2} > {output.flagstat2}
+            samtools idxstats {output.outbam2} > {output.idxstat2}
+        fi
+        """
 
 rule dedup:
     """
@@ -273,125 +236,124 @@ rule dedup:
         Deduplicated Q5DD.bam for all assays, plus Q5DD.tagAlign if cfchip assay
 
     """
-    input: 
-        bam2=join(workpath,bam_dir,"{name}.Q5.bam")
+    input:
+        bam2                                = join(bam_dir,"{name}.Q5.bam")
     output:
-        out5=join(workpath,bam_dir,"{name}.Q5DD.bam"),
-        out5f=join(workpath,bam_dir,"{name}.Q5DD.bam.flagstat"),
-        out5i=join(workpath,bam_dir,"{name}.Q5DD.bam.idxstat"),
-        out6=provided(join(workpath,bam_dir,"{name}.bwa.Q5.duplic"), paired_end),
-        out7=dedup_out7(join(workpath,bam_dir,"{name}"), assay, paired_end)
+        out5                                = join(bam_dir, "{name}.Q5DD.bam"),  # bam_dir already includes workpath; joining workpath again doubles a relative prefix
+        out5f                               = join(bam_dir, "{name}.Q5DD.bam.flagstat"),
+        out5i                               = join(bam_dir, "{name}.Q5DD.bam.idxstat"),
+        out6                                = provided(join(bam_dir, "{name}.bwa.Q5.duplic"), paired_end),  # duplication-metrics file, declared only for paired-end runs
+        out7                                = dedup_out7(join(bam_dir, "{name}"), assay, paired_end)
     params:
-        rname='dedup',
-        picardver=config['tools']['PICARDVER'],
-        samtoolsver=config['tools']['SAMTOOLSVER'],
-        bedtoolsver=config['tools']['BEDTOOLSVER'],
-        macsver=config['tools']['MACSVER'],
-        gsize=config['references'][genome]['EFFECTIVEGENOMESIZE'],
-        folder=join(workpath,bam_dir),
-        genomefile=config['references'][genome]['REFLEN'],
-        rver=config['tools']['RVER'],
-        javaram='16g',
-        tmpdir=tmpdir,
-        tmpBam="{name}.Q5DD.withXY.bam",
-        rscript=join(config['references'][genome]['cfChIP_TOOLS_SRC'], "bam2fragment.R"),
-        paired_end = paired_end
-    shell: """
-    module load {params.samtoolsver};
-    module load {params.picardver};
-    module load {params.bedtoolsver};
-    module load {params.macsver};
-    module load {params.rver}; 
-    if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
-    tmp=$(mktemp -d -p "{params.tmpdir}")
-    trap 'rm -rf "${{tmp}}"' EXIT
-    
-    if [ "{assay}" == "cfchip" ];then
-      java -Xmx{params.javaram} \\
-        -jar $PICARDJARPATH/picard.jar MarkDuplicates \\
-        -I {input.bam2} \\
-        -O {params.tmpBam} \\
-        -TMP_DIR ${{tmp}} \\
-        -VALIDATION_STRINGENCY SILENT \\
-        -REMOVE_DUPLICATES true \\
-        -METRICS_FILE {output.out6};
-      samtools index {params.tmpBam};
-      samtools view -b {params.tmpBam} chr{{1..22}} > {output.out5};
-      Rscript {params.rscript} {params.tmpBam} {output.out7};
-      rm {params.tmpBam} {params.tmpBam}.bai;
-      samtools index {output.out5};
-      samtools flagstat {output.out5} > {output.out5f};
-      samtools idxstats {output.out5} > {output.out5i}; 
-    elif [ '{params.paired_end}' == False ];then
-      macs2 filterdup -i {input} -g {params.gsize} --keep-dup="auto" -o ${{tmp}}/TmpTagAlign;
-      awk -F"\\t" -v OFS="\\t" '{{if ($2>0 && $3>0) {{print}}}}' ${{tmp}}/TmpTagAlign > ${{tmp}}/TmpTagAlign2;
-      awk -F"\\t" -v OFS="\\t" '{{print $1,1,$2}}' {params.genomefile} | sort -k1,1 -k2,2n > ${{tmp}}/GenomeFileBed;
-      bedtools intersect -wa -f 1.0 -a ${{tmp}}/TmpTagAlign2 -b ${{tmp}}/GenomeFileBed > ${{tmp}}/TmpTagAlign3;
-      bedtools bedtobam -i ${{tmp}}/TmpTagAlign3 -g {params.genomefile} | samtools sort -@4 -o {output.out5};
-      gzip ${{tmp}}/TmpTagAlign3;
-      mv ${{tmp}}/TmpTagAlign3.gz {output.out7};
-      samtools index {output.out5};
-      samtools flagstat {output.out5} > {output.out5f}
-      samtools idxstats {output.out5} > {output.out5i}
-    else
-      java -Xmx{params.javaram} \\
-        -jar $PICARDJARPATH/picard.jar MarkDuplicates \\
-        -I {input.bam2} \\
-        -O {output.out5} \\
-        -TMP_DIR ${{tmp}} \\
-        -VALIDATION_STRINGENCY SILENT \\
-        -REMOVE_DUPLICATES true \\
-        -METRICS_FILE {output.out6};
-      samtools index {output.out5};
-      samtools flagstat {output.out5} > {output.out5f};
-      samtools idxstats {output.out5} > {output.out5i}; 
-    fi
-    """
+        rname                               = 'dedup',
+        picardver                           = config['tools']['PICARDVER'],
+        samtoolsver                         = config['tools']['SAMTOOLSVER'],
+        bedtoolsver                         = config['tools']['BEDTOOLSVER'],
+        macsver                             = config['tools']['MACSVER'],
+        gsize                               = config['references'][genome]['EFFECTIVEGENOMESIZE'],
+        folder                              = join(workpath,bam_dir),
+        genomefile                          = config['references'][genome]['REFLEN'],
+        rver                                = config['tools']['RVER'],
+        javaram                             = '16g',
+        tmpdir                              = tmpdir,
+        tmpBam                              = "{name}.Q5DD.withXY.bam",
+        rscript                             = join(config['references'][genome]['cfChIP_TOOLS_SRC'], "bam2fragment.R"),
+    shell: 
+        """
+        module load {params.samtoolsver};
+        module load {params.picardver};
+        module load {params.bedtoolsver};
+        module load {params.macsver};
+        module load {params.rver}; 
+        if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
+        tmp=$(mktemp -d -p "{params.tmpdir}")
+        trap 'rm -rf "${{tmp}}"' EXIT
+        
+        if [ "{assay}" == "cfchip" ];then
+        java -Xmx{params.javaram} \\
+            -jar $PICARDJARPATH/picard.jar MarkDuplicates \\
+            -I {input.bam2} \\
+            -O {params.tmpBam} \\
+            -TMP_DIR ${{tmp}} \\
+            -VALIDATION_STRINGENCY SILENT \\
+            -REMOVE_DUPLICATES true \\
+            -METRICS_FILE {output.out6};
+        samtools index {params.tmpBam};
+        samtools view -b {params.tmpBam} chr{{1..22}} > {output.out5};
+        Rscript {params.rscript} {params.tmpBam} {output.out7};
+        rm {params.tmpBam} {params.tmpBam}.bai;
+        samtools index {output.out5};
+        samtools flagstat {output.out5} > {output.out5f};
+        samtools idxstats {output.out5} > {output.out5i}; 
+        elif [ '""" + str(paired_end) + """' == False ];then
+        macs2 filterdup -i {input} -g {params.gsize} --keep-dup="auto" -o ${{tmp}}/TmpTagAlign;
+        awk -F"\\t" -v OFS="\\t" '{{if ($2>0 && $3>0) {{print}}}}' ${{tmp}}/TmpTagAlign > ${{tmp}}/TmpTagAlign2;
+        awk -F"\\t" -v OFS="\\t" '{{print $1,1,$2}}' {params.genomefile} | sort -k1,1 -k2,2n > ${{tmp}}/GenomeFileBed;
+        bedtools intersect -wa -f 1.0 -a ${{tmp}}/TmpTagAlign2 -b ${{tmp}}/GenomeFileBed > ${{tmp}}/TmpTagAlign3;
+        bedtools bedtobam -i ${{tmp}}/TmpTagAlign3 -g {params.genomefile} | samtools sort -@4 -o {output.out5};
+        gzip ${{tmp}}/TmpTagAlign3;
+        mv ${{tmp}}/TmpTagAlign3.gz {output.out7};
+        samtools index {output.out5};
+        samtools flagstat {output.out5} > {output.out5f}
+        samtools idxstats {output.out5} > {output.out5i}
+        else
+        java -Xmx{params.javaram} \\
+            -jar $PICARDJARPATH/picard.jar MarkDuplicates \\
+            -I {input.bam2} \\
+            -O {output.out5} \\
+            -TMP_DIR ${{tmp}} \\
+            -VALIDATION_STRINGENCY SILENT \\
+            -REMOVE_DUPLICATES true \\
+            -METRICS_FILE {output.out6};
+        samtools index {output.out5};
+        samtools flagstat {output.out5} > {output.out5f};
+        samtools idxstats {output.out5} > {output.out5i}; 
+        fi
+        """
 
 rule ppqt:
     input:
-        bam = lambda w : join(workpath,bam_dir,w.name + "." + w.ext + "." + extensionsDict[w.ext])
+        bam                                 = lambda w : join(bam_dir, w.name + "." + w.ext + "." + get_bam_ext(w.ext, paired_end))
     output:                                          
-        ppqt= join(workpath,bam_dir,ppqt_dir,"{name}.{ext}.ppqt"),
-        pdf= join(workpath,bam_dir,ppqt_dir,"{name}.{ext}.pdf"),
-        txt= join(workpath,bam_dir,ppqt_dir,"{name}.{ext}.ppqt.txt"),
+        ppqt                                = join(ppqt_dir, "{name}.{ext}.ppqt"),
+        pdf                                 = join(ppqt_dir, "{name}.{ext}.pdf"),
+        txt                                 = join(ppqt_dir, "{name}.{ext}.ppqt.txt"),
     params:
-        rname="ppqt",
-        samtoolsver=config['tools']['SAMTOOLSVER'],
-        rver=config['tools']['RVER'],
-        scriptPy=join(workpath,"workflow","scripts","ppqt_process.py"),
-        inputSample=(lambda w: w.name in uniq_inputs),
-        tmpdir=tmpdir,
-        paired_end = paired_end,
-        file_name = "{name}"
+        rname                               = "ppqt",
+        samtoolsver                         = config['tools']['SAMTOOLSVER'],
+        rver                                = config['tools']['RVER'],
+        scriptPy                            = join(workpath, "bin", "ppqt_process.py"),
+        tmpdir                              = tmpdir,
+        file_name                           = "{name}"
     container: config['images']['ppqt']
-    shell: """
-    if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
-    tmp=$(mktemp -d -p "{params.tmpdir}")
-    trap 'rm -rf "${{tmp}}"' EXIT
+    shell: 
+        """
+        if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
+        tmp=$(mktemp -d -p "{params.tmpdir}")
+        trap 'rm -rf "${{tmp}}"' EXIT
 
-    if [ '{params.paired_end}' == True ]; then
-        samtools view -b \\
-          -f 66 \\
-          -o ${{tmp}}/bam1.f66.bam {input.bam};
-        samtools index ${{tmp}}/bam1.f66.bam;
-        run_spp.R -c=${{tmp}}/bam1.f66.bam \
-                    -savp={output.pdf} -out={output.ppqt} \
-                    -tmpdir=${{tmp}} -rf
-    else
-        if [[ '{input.bam}' == *.gz ]]; then 
-            ln -s {input.bam} ${{tmp}}/{params.file_name}.Q5DD.tagAlign.gz;
-            run_spp.R -c=${{tmp}}/{params.file_name}.Q5DD.tagAlign.gz \
-                     -savp={output.pdf} -out={output.ppqt} \
-                     -tmpdir=/lscratch/$SLURM_JOBID -rf
+        if [ '""" + str(paired_end) + """' == True ]; then
+            samtools view -b \\
+            -f 66 \\
+            -o ${{tmp}}/bam1.f66.bam {input.bam};
+            samtools index ${{tmp}}/bam1.f66.bam;
+            run_spp.R -c=${{tmp}}/bam1.f66.bam \
+                        -savp={output.pdf} -out={output.ppqt} \
+                        -tmpdir=${{tmp}} -rf
         else
-            run_spp.R -c={input.bam} \
-                     -savp={output.pdf} -out={output.ppqt} \
-                     -tmpdir=/lscratch/$SLURM_JOBID -rf
+            if [[ '{input.bam}' == *.gz ]]; then  # single-end: spp scratch goes to the job-private tmp dir, as in the paired-end branch
+                ln -s {input.bam} ${{tmp}}/{params.file_name}.Q5DD.tagAlign.gz;
+                run_spp.R -c=${{tmp}}/{params.file_name}.Q5DD.tagAlign.gz \
+                        -savp={output.pdf} -out={output.ppqt} \
+                        -tmpdir=${{tmp}} -rf
+            else
+                run_spp.R -c={input.bam} \
+                        -savp={output.pdf} -out={output.ppqt} \
+                        -tmpdir=${{tmp}} -rf
+            fi
         fi
-    fi
-    python {params.scriptPy} -i {output.ppqt} -o {output.txt}
-    """
+        python {params.scriptPy} -i {output.ppqt} -o {output.txt}
+        """
 
 rule bam2bw:
     """
@@ -405,42 +367,46 @@ rule bam2bw:
        an associated score, RPGC
     """
     input:
-        bam = get_bam_input,
-        ppqt = get_ppqt_input
+        bam                                 = lambda w: get_bam_input(bam_dir, w, paired_end),
+        ppqt                                = lambda w: get_ppqt_input(ppqt_dir, w, paired_end),
     output:
-        outbw=join(workpath,bw_dir,"{name}.{ext}.RPGC.bw"),
+        outbw                               = join(bw_dir, "{name}.{ext}.RPGC.bw"),
     params:
-        rname="bam2bw",
-        effectivegenomesize=config['references'][genome]['EFFECTIVEGENOMESIZE'],
-        paired_end = paired_end,
-        tmpdir=tmpdir,
-        name = "{name}"
-    threads: int(allocated("threads", "bam2bw", cluster)),
-    envmodules: config['tools']['DEEPTOOLSVER'],
-    shell: """
-    if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
-    tmp=$(mktemp -d -p "{params.tmpdir}")
-    trap 'rm -rf "${{tmp}}"' EXIT
+        rname                               = "bam2bw",
+        name                                = "{name}",
+        effectivegenomesize                 = config['references'][genome]['EFFECTIVEGENOMESIZE'],
+        paired_end                          = paired_end,
+        tmpdir                              = tmpdir,
+    threads: 
+        int(allocated("threads", "bam2bw", cluster)),
+    envmodules: 
+        config['tools']['DEEPTOOLSVER'],
+    shell: 
+        """
+        if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
+        tmp=$(mktemp -d -p "{params.tmpdir}")
+        trap 'rm -rf "${{tmp}}"' EXIT
+
+        bam_cov_option={input.ppqt}
+        if [ '{params.paired_end}' == False ]; then
+            ppqt_len=$(awk '{{print $1}}' {input.ppqt})
+            bam_cov_option="-e ${{ppqt_len}}"
+        else 
+            bam_cov_option=--centerReads
+        fi
+        echo "printing out value of bam-cov-option $bam_cov_option"
+        
+        bamCoverage \\
+            --bam {input.bam} \\
+            -o {output.outbw} \\
+            --binSize 25 \\
+            --smoothLength 75 \\
+            --numberOfProcessors {threads} \\
+            --normalizeUsing RPGC \\
+            --effectiveGenomeSize {params.effectivegenomesize} \\
+            ${{bam_cov_option}};  
+        """
 
-    bam_cov_option={input.ppqt}
-    if [ '{params.paired_end}' == False ]; then
-        ppqt_len=$(awk '{{print $1}}' {input.ppqt})
-        bam_cov_option="-e ${{ppqt_len}}"
-    else 
-        bam_cov_option=--centerReads
-    fi
-    echo "printing out value of bam-cov-option $bam_cov_option"
-    
-    bamCoverage \\
-        --bam {input.bam} \\
-        -o {output.outbw} \\
-        --binSize 25 \\
-        --smoothLength 75 \\
-        --numberOfProcessors {threads} \\
-        --normalizeUsing RPGC \\
-        --effectiveGenomeSize {params.effectivegenomesize} \\
-        ${{bam_cov_option}};  
-    """    
 
 rule inputnorm:
     """
@@ -453,22 +419,26 @@ rule inputnorm:
        bigWig file of treatmment sample normalizes with its input control
     """
     input:
-        chip = join(workpath,bw_dir,"{name}.Q5DD.RPGC.bw"),
-        ctrl = lambda w : join(workpath,bw_dir,chip2input[w.name] + ".Q5DD.RPGC.bw")
+        bws = lambda w: ctrl_test(chip2input, w.name, bw_dir)
     output:
-        join(workpath,bw_dir,"{name}.Q5DD.RPGC.inputnorm.bw")
+        join(bw_dir, "{name}.Q5DD.RPGC.inputnorm.bw")
     params:
-        rname="inputnorm",
-    threads: int(allocated("threads", "inputnorm", cluster)),
-    envmodules: config['tools']['DEEPTOOLSVER'],
-    shell: """
-    bigwigCompare \\
-        --binSize 25 \\
-        --outFileName {output} \\
-        --outFileFormat 'bigwig' \\
-        --bigwig1 {input.chip} \\
-        --bigwig2 {input.ctrl} \\
-        --operation 'subtract' \\
-        --skipNonCoveredRegions \\
-        -p {threads}
-    """
+        rname                               = "inputnorm",
+        bigwig_declare                      = lambda w, input: f"--bigwig1 {input.bws[0]}" if len(input.bws) == 1 \
+                                                else f"--bigwig1 {input.bws[0]} --bigwig2 {input.bws[1]}"
+    threads: 
+        int(allocated("threads", "inputnorm", cluster)),
+    envmodules: 
+        config['tools']['DEEPTOOLSVER'],
+    shell: 
+        """
+        echo {input}
+        bigwigCompare \\
+            --binSize 25 \\
+            --outFileName {output} \\
+            --outFileFormat 'bigwig' \\
+            {params.bigwig_declare} \\
+            --operation 'subtract' \\
+            --skipNonCoveredRegions \\
+            -p {threads}
+        """
diff --git a/workflow/scripts/DiffBind_v2_ChIPseq.Rmd b/workflow/scripts/DiffBind_v2_ChIPseq.Rmd
deleted file mode 100644
index 031799d..0000000
--- a/workflow/scripts/DiffBind_v2_ChIPseq.Rmd
+++ /dev/null
@@ -1,260 +0,0 @@
----
-title: "DiffBind: ChIP-seq pipeline"
-output: 
-    html_document:
-        toc: true
-        toc_float:
-           collapsed: false
-        number_sections: true
-        toc_depth: 3
-        fig_width: 7
-        fig_height: 6
-params:
-    csvfile: samplesheet.csv
-    contrasts: "group1_vs_group2"
-    peakcaller: "macs"
----
-
-<style type="text/css">
-  body{
-  font-size: 12pt;
-}
-</style>
-
-```{r, include=FALSE, warning=FALSE, message=FALSE}
-## grab args
-dateandtime<-format(Sys.time(), "%a %b %d %Y - %X")
-
-csvfile <- params$csvfile
-contrasts <- params$contrasts
-peakcaller <- params$peakcaller
-```
-
-**Groups being compared:**
-    *`r contrasts`*  
-**Peak sources:**
-    *`r peakcaller`*  
-**Report generated:** 
-    *`r dateandtime`*  
-
-```{r setup, echo=FALSE, warning=FALSE,message=FALSE}
-knitr::opts_chunk$set(echo = FALSE, include=TRUE, message=FALSE, warning=FALSE, error=FALSE)
-suppressMessages(library(DT))
-suppressMessages(library(DiffBind))
-suppressMessages(library(parallel))
-```
-
-# Peak Data
-Read in sample sheet information and peak information
-```{r samples} 
-samples <- dba(sampleSheet=csvfile)
-consensus <- dba.peakset(samples,consensus=DBA_CONDITION)
-print(samples)
-```
-
-## Correlation heatmap: Only peaks
-Pearson correlation of peak positions: all samples versus all samples  
-```{r heatmap1}
-try(dba.plotHeatmap(samples,main="",cexRow=1,cexCol=1),silent=TRUE)
-```
-
-## PCA: Only peaks
-Variance of peak positions  
-```{r PCA1, fig.height=5,fig.width=5}
-try(dba.plotPCA(samples,DBA_CONDITION),silent=TRUE)
-```
-
-## Overlapping peak counts
-Number of overlapping peaks.  
-If the number of samples is greater than 4, a "consensus" peak Venn diagram is created, where
-the consensus peak set are the peaks identified in at least 2 samples for that condition. This is different
-from the consensus peak set used for differential analyses.
-```{r Venn, fig_height=4}
-if (nrow(samples$samples) < 5) {
-   dba.plotVenn(samples,1:nrow(samples$samples)) 
-} else {
-   dba.plotVenn(consensus,consensus$masks$Consensus,main="Binding Site Overlaps: 'consensus', comparing between groups")
-   try(dba.plotVenn(samples,samples$masks[[3]],main="Binding Site Overlaps: samples in Group1"),silent=TRUE)
-   try(dba.plotVenn(samples,samples$masks[[4]],main="Binding Site Overlaps: samples in Group2"),silent=TRUE)
-}
-```
-
-# Consensus peaks and counts
-Consensus peaks are peaks found in at least two samples, independent of condition.
-FRiP is of consensus peaks and will not match FRiP values calculated outside of this tool.
-```{r peaksORsummits}
-if ( grepl("narrow",samples$samples$Peaks[1]) ) {
-   summits <- TRUE
-   print ("Narrow peak calling tool.")
-   print ("Differential peaks are 250bp upstream and downstream of the summits.")
-} else if ( grepl("broad",samples$samples$Peaks[1]) ) {
-  summits <- FALSE
-  print ("Broad peak calling tool.")
-  print ("Differential peaks are consensus peaks.")
-} else {
-  summits <- FALSE
-  print ("Indeterminate peak calling tool.")
-  print ("Differential peaks are consensus peaks.")
-}
-```
-
-```{r DBcount}
-if (summits == TRUE) {
-	DBdataCounts <- dba.count(samples, summits=250)
-} else {
-	DBdataCounts <- dba.count(samples)
-}
-print(DBdataCounts)
-outfile2 <- paste0(contrasts, "-", peakcaller,"_Diffbind_consensusPeaks.bed")
-consensus2 <- dba.peakset(DBdataCounts,bRetrieve=T)
-consensus2$name <- paste0("Peak",1:length(consensus2))
-#rtracklayer::export(consensus2,outfile2)
-```
-
-## Correlation heatmap: Peaks and reads
-Pearson correlation of library-size normalized counts of consensus peaks: all samples versus all samples
-```{r heatmap2}
-try(dba.plotHeatmap(DBdataCounts,main="",cexRow=1,cexCol=1),silent=TRUE)
-```
-
-## Heatmap: Average signal across each peak
-1000 most variable consensus peaks (library-size normalized counts)
-```{r heatmap3}
-try(dba.plotHeatmap(DBdataCounts,correlations=FALSE,cexRow=1,cexCol=1),silent=TRUE)
-```
-
-## PCA: Peaks and reads
-Variation of library-size normalized counts of consensus peaks
-```{r PCA2, fig.height=5,fig.width=5}
-try(dba.plotPCA(DBdataCounts,DBA_CONDITION),silent=TRUE)
-```
-
-# Set Up Contrast
-Contrast is Group1 - Group2.
-```{r contrast}
-DBdatacontrast <- dba.contrast(DBdataCounts, minMembers=2, categories = DBA_CONDITION)
-print(DBdatacontrast)
-```
-
-# Differential Analysis
-This report shows the differential analysis with two tools: Deseq2 and EdgeR. For most
-projects, Deseq2 is the optimal tool. Both tools assume that the majority of peaks are
-not changing between the two conditions. EdgeR also assumes that there are equal numbers
-of peaks on each side of the contrast, so it normalizes the data more than Deseq2. EdgeR
-is especially useful when this assumption is true or when there are large differences in
-library size across samples. All concentrations are on log2 scale.
-
-```{r analyze}
-DBAnalysisDeseq2 <- dba.analyze(DBdatacontrast, method = DBA_DESEQ2)
-DBAnalysisEdgeR <- dba.analyze(DBdatacontrast, method = DBA_EDGER)
-```
-
-```{r report}
-DBReportDeseq2 <- dba.report(DBAnalysisDeseq2, method = DBA_DESEQ2)
-DBReportEdgeR <- dba.report(DBAnalysisEdgeR, method = DBA_EDGER)
-```
-
-## PCA {.tabset .tabset-fade}
-Variance of differential peaks only
-
-### DeSeq2 {-}
-```{r PCA3, fig.height=5,fig.width=5}
-try(dba.plotPCA(DBAnalysisDeseq2, contrast=1, method= DBA_DESEQ2),silent=TRUE)
-```
-
-### EdgeR {-}
-```{r PCA4, fig.height=5,fig.width=5}
-try(dba.plotPCA(DBAnalysisEdgeR, contrast=1, method = DBA_EDGER),silent=TRUE)
-```
-
-## MA plot {.tabset .tabset-fade}
-"Log concentration" means average concentration across all samples.
-Each dot is a consensus peak.
-
-### DeSeq2 {-}
-```{r MA_D}
-try(dba.plotMA(DBAnalysisDeseq2, method = DBA_DESEQ2),silent=TRUE)
-```
-
-### EdgeR {-}
-```{r MA_E}
-try(dba.plotMA(DBAnalysisEdgeR, method = DBA_EDGER),silent=TRUE)
-```
-
-## Volcano plot {.tabset .tabset-fade}
-Each dot is a consensus peak.
-
-### DeSeq2 {-}
-```{r Volcano1}
-try(dba.plotVolcano(DBAnalysisDeseq2, method = DBA_DESEQ2),silent=TRUE)
-```
-
-### EdgeR {-}
-```{r Volcano2}
-try(dba.plotVolcano(DBAnalysisEdgeR, method = DBA_EDGER),silent=TRUE)
-```
-
-## Heatmap: Differential {.tabset .tabset-fade}
-1000 most significant differential peaks (Deseq2 or EdgeR normalized)
-
-### DeSeq2 {-}
-```{r heatmap4D}
-try(dba.plotHeatmap(DBAnalysisDeseq2,contrast=1,method = DBA_DESEQ2,correlations=FALSE,margin=20,cexRow=1,cexCol=1),silent=TRUE)
-```
-
-### EdgeR {-}
-```{r heatmap4E}
-try(dba.plotHeatmap(DBAnalysisEdgeR,contrast=1,method = DBA_EDGER,correlations=FALSE,margin=20,cexRow=1,cexCol=1),silent=TRUE)
-```
-
-## Top 500 differentially bound peaks {.tabset .tabset-fade}
-### DeSeq2 {-}
-```{r Deseq2Report}
-outfile <- paste0(contrasts, "-", peakcaller, "_Diffbind_Deseq2.txt")
-outfile2 <- paste0(contrasts, "-", peakcaller, "_Diffbind_Deseq2.bed")
-DBReportDeseq2$name <- paste0("Peak",1:length(DBReportDeseq2))
-try(rtracklayer::export(DBReportDeseq2, outfile2),silent=TRUE)
-write.table(DBReportDeseq2, outfile, quote=F, sep="\t", row.names=F)
-D2i <- length(DBReportDeseq2)
-if (D2i == 0) {
-   i=1
-} else if (D2i > 500) {
-   i=500
-} else {
-   i=D2i
-}
-try(DT::datatable(data.frame(DBReportDeseq2)[1:i,], rownames=F),silent=TRUE)
-
-report2 <- dba.report(DBAnalysisDeseq2,method = DBA_DESEQ2,th=100,bNormalized=T,bFlip=FALSE,precision=0,bCalled=T)
-outfile3 <- paste0(contrasts, "-", peakcaller, "_Diffbind_Deseq2_fullList.txt")
-write.table(report2, outfile3, quote=F, sep="\t", row.names=F)
-```
-
-### EdgeR {-}
-```{r EdgeRReport}
-outfile <- paste0(contrasts, "-", peakcaller,"_Diffbind_EdgeR.txt")
-outfile2 <- paste0(contrasts, "-", peakcaller,"_Diffbind_EdgeR.bed")
-DBReportEdgeR$name <- paste0("Peak",1:length(DBReportEdgeR))
-try(rtracklayer::export(DBReportEdgeR, outfile2),silent=TRUE)
-write.table(DBReportEdgeR, outfile, quote=F, sep="\t", row.names=F)
-Ei <- length(DBReportEdgeR)
-if (Ei == 0) {
-   i=1
-} else if (Ei > 500) {
-   i=500
-} else {
-   i=Ei
-}
-try(DT::datatable(data.frame(DBReportEdgeR)[1:i,], rownames=F),silent=TRUE)
-
-report2 <- dba.report(DBAnalysisEdgeR,method = DBA_EDGER,th=100,bNormalized=T,bFlip=FALSE,precision=0,bCalled=T)
-outfile3 <- paste0(contrasts, "-", peakcaller, "_Diffbind_EdgeR_fullList.txt")
-write.table(report2, outfile3, quote=F, sep="\t", row.names=F)
-```
-
-## R tool version information
-```{r Info}
-sessionInfo()
-```
-
-<div class="tocify-extend-page" data-unique="tocify-extend-page" style="height: 0;"></div>
diff --git a/workflow/scripts/DiffBind_v2_ChIPseq_block.Rmd b/workflow/scripts/DiffBind_v2_ChIPseq_block.Rmd
deleted file mode 100644
index 2a508b5..0000000
--- a/workflow/scripts/DiffBind_v2_ChIPseq_block.Rmd
+++ /dev/null
@@ -1,267 +0,0 @@
----
-title: "DiffBind: ChIP-seq pipeline, paired/blocked analysis"
-output: 
-    html_document:
-        toc: true
-        toc_float:
-           collapsed: false
-        number_sections: true
-        toc_depth: 3
-        fig_width: 7
-        fig_height: 6
-params:
-    csvfile: samplesheet.csv
-    contrasts: "group1_vs_group2"
-    peakcaller: "macs"
-    dir: "/path/to/DiffBindBlock/directory"
----
-
-<style type="text/css">
-  body{
-  font-size: 12pt;
-}
-</style>
-
-```{r, include=FALSE, warning=FALSE, message=FALSE}
-## grab args
-dateandtime<-format(Sys.time(), "%a %b %d %Y - %X")
-
-csvfile <- params$csvfile
-contrasts <- params$contrasts
-peakcaller <- params$peakcaller
-```
-
-**Groups being compared:**
-    *`r contrasts`*  
-**Peak sources:**
-    *`r peakcaller`*  
-**Report generated:** 
-    *`r dateandtime`*  
-
-```{r setup, echo=FALSE, warning=FALSE,message=FALSE}
-knitr::opts_chunk$set(echo = FALSE, include=TRUE, message=FALSE, warning=FALSE, error=FALSE)
-knitr::opts_knit$set(root.dir=params$dir)
-suppressMessages(library(DT))
-suppressMessages(library(DiffBind))
-suppressMessages(library(parallel))
-```
-
-# Peak Data
-Read in sample sheet information and peak information
-```{r samples} 
-samples <- dba(sampleSheet=csvfile)
-consensus <- dba.peakset(samples,consensus=DBA_CONDITION)
-print(samples)
-```
-
-## Correlation heatmap: Only peaks
-Pearson correlation of peak positions: all samples versus all samples  
-```{r heatmap1}
-try(dba.plotHeatmap(samples,main="",cexRow=1,cexCol=1),silent=TRUE)
-```
-
-## PCA: Only peaks
-Variance of peak positions  
-```{r PCA1, fig.height=5,fig.width=5}
-try(dba.plotPCA(samples,DBA_CONDITION),silent=TRUE)
-```
-
-## Overlapping peak counts
-Number of overlapping peaks.  
-If the number of samples is greater than 4, a "consensus" peak Venn diagram is created, where
-the consensus peak set are the peaks identified in at least 2 samples for that condition. This is different
-from the consensus peak set used for differential analyses.
-```{r Venn, fig_height=4}
-if (nrow(samples$samples) < 5) {
-   dba.plotVenn(samples,1:nrow(samples$samples)) 
-} else {
-   dba.plotVenn(consensus,consensus$masks$Consensus,main="Binding Site Overlaps: 'consensus', comparing between groups")
-   try(dba.plotVenn(samples,samples$masks[[3]],main="Binding Site Overlaps: samples in Group1"),silent=TRUE)
-   try(dba.plotVenn(samples,samples$masks[[4]],main="Binding Site Overlaps: samples in Group2"),silent=TRUE)
-}
-```
-
-# Consensus peaks and counts
-Consensus peaks are peaks found in at least two samples, independent of condition.
-FRiP is of consensus peaks and will not match FRiP values calculated outside of this tool.
-```{r peaksORsummits}
-if ( grepl("narrow",samples$samples$Peaks[1]) ) {
-   summits <- TRUE
-   print ("Narrow peak calling tool.")
-   print ("Differential peaks are 250bp upstream and downstream of the summits.")
-} else if ( grepl("broad",samples$samples$Peaks[1]) ) {
-  summits <- FALSE
-  print ("Broad peak calling tool.")
-  print ("Differential peaks are consensus peaks.")
-} else {
-  summits <- FALSE
-  print ("Indeterminate peak calling tool.")
-  print ("Differential peaks are consensus peaks.")
-}
-```
-
-```{r DBcount}
-if (summits == TRUE) {
-	DBdataCounts <- dba.count(samples, summits=250)
-} else {
-	DBdataCounts <- dba.count(samples)
-}
-print(DBdataCounts)
-outfile2 <- paste0(contrasts, "-", peakcaller,"_Diffbind_consensusPeaks.bed")
-consensus2 <- dba.peakset(DBdataCounts,bRetrieve=T)
-consensus2$name <- paste0("Peak",1:length(consensus2))
-#rtracklayer::export(consensus2,outfile2)
-```
-
-## Correlation heatmap: Peaks and reads
-Pearson correlation of library-size normalized counts of consensus peaks: all samples versus all samples
-```{r heatmap2}
-try(dba.plotHeatmap(DBdataCounts,main="",cexRow=1,cexCol=1),silent=TRUE)
-```
-
-## Heatmap: Average signal across each peak
-1000 most variable consensus peaks (library-size normalized counts)
-```{r heatmap3}
-try(dba.plotHeatmap(DBdataCounts,correlations=FALSE,cexRow=1,cexCol=1),silent=TRUE)
-```
-
-## PCA: Peaks and reads
-Variation of library-size normalized counts of consensus peaks
-```{r PCA2, fig.height=5,fig.width=5}
-try(dba.plotPCA(DBdataCounts,DBA_CONDITION),silent=TRUE)
-```
-
-# Set Up Contrast
-Contrast is Group1 - Group2.
-```{r contrast}
-DBdatacontrast <- dba.contrast(DBdataCounts, minMembers=2, categories = DBA_CONDITION,
-							   block=DBA_TREATMENT)
-print(DBdatacontrast)
-```
-
-# Differential Analysis
-This report shows the differential analysis with two tools: Deseq2 and EdgeR. For most
-projects, Deseq2 is the optimal tool. Both tools assume that the majority of peaks are
-not changing between the two conditions. EdgeR also assumes that there are equal numbers
-of peaks on each side of the contrast, so it normalizes the data more than Deseq2. EdgeR
-is especially useful when this assumption is true or when there are large differences in
-library size across samples. All concentrations are on log2 scale.
-
-```{r analyze}
-DBAnalysisDeseq2 <- dba.analyze(DBdatacontrast, method = DBA_DESEQ2)
-DBAnalysisEdgeR <- dba.analyze(DBdatacontrast, method = DBA_EDGER)
-```
-
-```{r report}
-DBReportDeseq2 <- dba.report(DBAnalysisDeseq2, method = DBA_DESEQ2_BLOCK)
-DBReportEdgeR <- dba.report(DBAnalysisEdgeR, method = DBA_EDGER_BLOCK)
-```
-
-## PCA {.tabset .tabset-fade}
-Variance of differential peaks only
-
-### DeSeq2 {-}
-```{r PCA3, fig.height=5,fig.width=5}
-try(dba.plotPCA(DBAnalysisDeseq2, contrast=1, method= DBA_DESEQ2_BLOCK),silent=TRUE)
-```
-
-### EdgeR {-}
-```{r PCA4, fig.height=5,fig.width=5}
-try(dba.plotPCA(DBAnalysisEdgeR, contrast=1, method = DBA_EDGER_BLOCK),silent=TRUE)
-```
-
-## MA plot {.tabset .tabset-fade}
-"Log concentration" means average concentration across all samples.
-Each dot is a consensus peak.
-
-### DeSeq2 {-}
-```{r MA_D}
-try(dba.plotMA(DBAnalysisDeseq2, method = DBA_DESEQ2_BLOCK),silent=TRUE)
-```
-
-### EdgeR {-}
-```{r MA_E}
-try(dba.plotMA(DBAnalysisEdgeR, method = DBA_EDGER_BLOCK),silent=TRUE)
-```
-
-## Volcano plot {.tabset .tabset-fade}
-Each dot is a consensus peak.
-
-### DeSeq2 {-}
-```{r Volcano1}
-try(dba.plotVolcano(DBAnalysisDeseq2, method = DBA_DESEQ2_BLOCK),silent=TRUE)
-```
-
-### EdgeR {-}
-```{r Volcano2}
-try(dba.plotVolcano(DBAnalysisEdgeR, method = DBA_EDGER_BLOCK),silent=TRUE)
-```
-
-## Heatmap: Differential {.tabset .tabset-fade}
-1000 most significant differential peaks (Deseq2 or EdgeR normalized)
-
-### DeSeq2 {-}
-```{r heatmap4D}
-try(dba.plotHeatmap(DBAnalysisDeseq2,contrast=1,method = DBA_DESEQ2_BLOCK,
-                    correlations=FALSE,margin=20,cexRow=1,cexCol=1),silent=TRUE)
-```
-
-### EdgeR {-}
-```{r heatmap4E}
-try(dba.plotHeatmap(DBAnalysisEdgeR,contrast=1,method = DBA_EDGER_BLOCK,
-                    correlations=FALSE,margin=20,cexRow=1,cexCol=1),silent=TRUE)
-```
-
-## Top 500 differentially bound peaks {.tabset .tabset-fade}
-### DeSeq2 {-}
-```{r Deseq2Report}
-outfile <- paste0(contrasts, "-", peakcaller, "_Diffbind_Deseq2_block.txt")
-outfile2 <- paste0(contrasts, "-", peakcaller, "_Diffbind_Deseq2_block.bed")
-DBReportDeseq2$name <- paste0("Peak",1:length(DBReportDeseq2))
-try(rtracklayer::export(DBReportDeseq2, outfile2),silent=TRUE)
-write.table(DBReportDeseq2, outfile, quote=F, sep="\t", row.names=F)
-D2i <- length(DBReportDeseq2)
-if (D2i == 0) {
-   i=1
-} else if (D2i > 500) {
-   i=500
-} else {
-   i=D2i
-}
-try(DT::datatable(data.frame(DBReportDeseq2)[1:i,], rownames=F),silent=TRUE)
-
-report2 <- dba.report(DBAnalysisDeseq2,method = DBA_DESEQ2_BLOCK,
-                      th=100,bNormalized=T,bFlip=FALSE,precision=0)
-outfile3 <- paste0(contrasts, "-", peakcaller, "_Diffbind_Deseq2_block_fullList.txt")
-write.table(report2, outfile3, quote=F, sep="\t", row.names=F)
-```
-
-### EdgeR {-}
-```{r EdgeRReport}
-outfile <- paste0(contrasts, "-", peakcaller,"_Diffbind_EdgeR_block.txt")
-outfile2 <- paste0(contrasts, "-", peakcaller,"_Diffbind_EdgeR_block.bed")
-DBReportEdgeR$name <- paste0("Peak",1:length(DBReportEdgeR))
-try(rtracklayer::export(DBReportEdgeR, outfile2),silent=TRUE)
-write.table(DBReportEdgeR, outfile, quote=F, sep="\t", row.names=F)
-Ei <- length(DBReportEdgeR)
-if (Ei == 0) {
-   i=1
-} else if (Ei > 500) {
-   i=500
-} else {
-   i=Ei
-}
-try(DT::datatable(data.frame(DBReportEdgeR)[1:i,], rownames=F),silent=TRUE)
-
-report2 <- dba.report(DBAnalysisEdgeR,method = DBA_EDGER_BLOCK,
-                      th=100,bNormalized=T,bFlip=FALSE,precision=0)
-outfile3 <- paste0(contrasts, "-", peakcaller, "_Diffbind_EdgeR_block_fullList.txt")
-write.table(report2, outfile3, quote=F, sep="\t", row.names=F)
-```
-
-## R tool version information
-```{r Info}
-sessionInfo()
-```
-
-<div class="tocify-extend-page" data-unique="tocify-extend-page" style="height: 0;"></div>
\ No newline at end of file
diff --git a/workflow/scripts/DiffBind_v2_cfChIP_QC.Rmd b/workflow/scripts/DiffBind_v2_cfChIP_QC.Rmd
deleted file mode 100644
index d058cec..0000000
--- a/workflow/scripts/DiffBind_v2_cfChIP_QC.Rmd
+++ /dev/null
@@ -1,204 +0,0 @@
----
-title: "DiffBind: cfChIP-seq QC"
-output: 
-    html_document:
-        toc: true
-        toc_float:
-           collapsed: false
-        number_sections: true
-        toc_depth: 3
-        fig_width: 7
-        fig_height: 6
-params:
-    csvfile: samplesheet.csv
-    contrasts: "group1_vs_group2"
-    peakcaller: "macs"
----
-
-<style type="text/css">
-  body{
-  font-size: 12pt;
-}
-</style>
-
-```{r, include=FALSE, warning=FALSE, message=FALSE}
-## grab args
-dateandtime<-format(Sys.time(), "%a %b %d %Y - %X")
-
-csvfile <- params$csvfile
-contrasts <- params$contrasts
-peakcaller <- params$peakcaller
-```
-
-**Peak sources:**
-    *`r peakcaller`*  
-**Report generated:** 
-    *`r dateandtime`*  
-
-```{r setup, echo=FALSE, warning=FALSE,message=FALSE}
-knitr::opts_chunk$set(echo = FALSE, include=TRUE, message=FALSE, warning=FALSE, error=FALSE)
-suppressMessages(library(DiffBind))
-suppressMessages(library(parallel))
-suppressMessages(library(dplyr))
-suppressMessages(library(tidyr))
-suppressMessages(library(umap))
-suppressMessages(library(ggplot2))
-suppressMessages(library(ggrepel))
-```
-
-# Peak Data
-Read in sample sheet information and peak information
-```{r samples} 
-samples <- dba(sampleSheet=csvfile)
-
-# if samples have Condition values
-if ( sum(samples$class["Condition",] != "") == ncol(samples$class) ) {
-  consensus <- dba.peakset(samples,consensus=DBA_CONDITION, minOverlap = min(table(samples$samples$Condition)))
-}
-print(samples)
-```
-
-## Correlation heatmap: Only peaks
-Pearson correlation of peak positions: all samples versus all samples  
-```{r heatmap1}
-try(dba.plotHeatmap(samples,main="",cexRow=1,cexCol=1),silent=TRUE)
-```
-
-## PCA: Only peaks
-Variance of peak positions  
-```{r PCA1, fig.height=5,fig.width=5}
-try(dba.plotPCA(samples),silent=TRUE)
-```
-
-## Overlapping peak counts
-Number of overlapping peaks.  
-If the number of samples is greater than 4, a "consensus" peak Venn diagram is created, where
-the consensus peak set are the peaks identified in at least 2 samples for that condition. This is different
-from the consensus peak set used for differential analyses.
-```{r Venn, fig_height=4}
-if (nrow(samples$samples) < 5) {
-   dba.plotVenn(samples,1:nrow(samples$samples)) 
-} else {
-   if ( sum(samples$class["Condition",] != "") == ncol(samples$class) ) {
-      dba.plotVenn(consensus,consensus$masks$Consensus,main="Binding Site Overlaps: 'consensus', comparing between groups")
-   } else {
-      print("Consensus peaks were not called")
-   }
-}
-```
-
-# Consensus peaks and counts
-Consensus peaks are peaks found in at least two samples, independent of condition.
-FRiP is of consensus peaks and will not match FRiP values calculated outside of this tool.
-```{r peaksORsummits}
-if ( grepl("narrow",samples$samples$Peaks[1]) ) {
-   summits <- TRUE
-   print ("Narrow peak calling tool.")
-   print ("Differential peaks are 250bp upstream and downstream of the summits.")
-} else if ( grepl("broad",samples$samples$Peaks[1]) ) {
-  summits <- FALSE
-  print ("Broad peak calling tool.")
-  print ("Differential peaks are consensus peaks.")
-} else {
-  summits <- FALSE
-  print ("Indeterminate peak calling tool.")
-  print ("Differential peaks are consensus peaks.")
-}
-```
-
-```{r DBcount}
-
-if ( sum(samples$class["Condition",] != "") == ncol(samples$class) ) {
-  minOv <- min(table(samples$samples$Condition))
-} else {
-  minOv <- floor(ncol(samples$class)/3)
-}
-
-print(paste0("The minimum number of overlaps is: ", minOv))
-
-if (summits == TRUE) {
-  DBdataCounts <- dba.count(samples, summits=250, minOverlap = minOv)
-} else {
-  DBdataCounts <- dba.count(samples, minOverlap = minOv)
-}
-print(DBdataCounts)
-
-```
-
-## Correlation heatmap: Peaks and reads
-Pearson correlation of library-size normalized counts of consensus peaks: all samples versus all samples
-```{r heatmap2}
-try(dba.plotHeatmap(DBdataCounts,main="",cexRow=1,cexCol=1),silent=TRUE)
-```
-
-## Heatmap: Average signal across each peak
-1000 most variable consensus peaks (library-size normalized counts)
-```{r heatmap3}
-try(dba.plotHeatmap(DBdataCounts,correlations=FALSE,cexRow=1,cexCol=1),silent=TRUE)
-```
-
-## PCA: Peaks and reads
-Variation of library-size normalized counts of consensus peaks
-```{r PCA2, fig.height=5,fig.width=5}
-try(dba.plotPCA(DBdataCounts),silent=TRUE)
-```
-
-```{r TMM}
-vec <- c("seqnames", "start", "end", "width", "strand", samples$samples$SampleID)
-consensus2 <- dba.peakset(DBdataCounts, bRetrieve=TRUE) %>% ##extracts TMM-normalized  counts
-  as.data.frame() %>% setNames(vec) %>% arrange(start, end) %>% mutate(Peaks = paste0("Peak",1:nrow(.))) %>% 
-  dplyr::select(1:4, Peaks, samples$samples$SampleID)
-
-outfile1 <- paste0(contrasts, "-", peakcaller, "_DiffBindQC_TMMcounts.csv")
-write.csv(consensus2, outfile1, row.names = F)
-
-outfile2 <- paste0(contrasts, "-", peakcaller, "_DiffBindQC_TMMcounts.bed")
-write.table(consensus2[,c("seqnames","start","end","Peaks")],
-    outfile2, quote=F, sep="\t", row.names=F, col.names=F)
-
-counts_TMM_ALL <- consensus2
-rownames(counts_TMM_ALL) <- counts_TMM_ALL$Peaks
-counts_TMM_ALL$Peaks <- NULL
-
-counts_TMM_ALL <- counts_TMM_ALL %>% dplyr::select(5:ncol(.)) %>%
-  t() %>% log10() %>% as.data.frame(.)
-##UMAP coordinates
-set.seed(123)
-if (nrow(samples$samples) < 16) {
-  umap_coord <- umap(counts_TMM_ALL, n_neighbors= nrow(samples$samples)-1)
-} else {
-  umap_coord <- umap(counts_TMM_ALL)
-}
-umap_coord <-as.data.frame(umap_coord$layout) %>% setNames(c("UMAP1", "UMAP2"))
-
-outfile <- paste0(contrasts, "-", peakcaller, "_DiffBindQC_UMAP.csv")
-write.csv(umap_coord, outfile, row.names = F)
-```
-
-## UMAP: peaks and reads 
-```{r UMAP_plot}
-p <- ggplot(umap_coord,aes(x = UMAP1, y = UMAP2, label = samples$samples$SampleID))+ ##With labels
-  geom_point(aes(color=samples$samples$Condition), size = 3) +
-  theme_bw()+ ggtitle(paste0("log-transformed counts:", "n = ", nrow(umap_coord))) +
-  theme(plot.title = element_text(hjust = 0.5)) +
-  labs(color = "Phenotypes") + theme(text=element_text(size=15))+
-  geom_text_repel(point.size = NA, size = 2.5)
-q <- ggplot(umap_coord,aes(x = UMAP1, y = UMAP2)) + ##No labels
-  geom_point(aes(color=samples$samples$Condition), size = 3) +
-  theme_bw()+ ggtitle(paste0("log-transformed counts:", "n = ", nrow(umap_coord))) +
-  theme(plot.title = element_text(hjust = 0.5)) +
-  labs(color ="Phenotypes") +  theme(text=element_text(size=15))
-  ##geom_text_repel(point.size = NA, size = 2.5)
-p
-
-if ( sum(samples$class["Condition",] != "") == ncol(samples$class) ) {
-q
-}
-```
-
-## R tool version information
-```{r Info}
-sessionInfo()
-```
-
-<div class="tocify-extend-page" data-unique="tocify-extend-page" style="height: 0;"></div>
diff --git a/workflow/scripts/FRiP_plot.R b/workflow/scripts/FRiP_plot.R
deleted file mode 100644
index 8a81f49..0000000
--- a/workflow/scripts/FRiP_plot.R
+++ /dev/null
@@ -1,112 +0,0 @@
-## FRIP_plot.R
-## Created by Tovah Markowitz
-## June 19, 2020
-## Updated: Jan 19, 2022
-## Updated: Novemeber 4, 2022
-
-args <- commandArgs(trailingOnly = TRUE)
-folder <- args[1]
-
-library(ggplot2)
-library(rjson)
-
-merge_files <- function(folder) {
-  files <- list.files(path=paste0(folder,"/PeakQC"), pattern="FRiP_table.txt", 
-  	   	full.names=T)
-  allList <- lapply(files,read.table,header=T)
-  allData <- do.call(rbind.data.frame, allList)
-  write.table(allData, paste0(folder, "/PeakQC/FRiP_All_table.txt"), quote=F, 
-  		row.names=F, sep="\t")
-  return(allData)
-}
-
-plot_barplots <- function(inData, groupName, folder) {
-  p <- ggplot(inData,aes(x=bamsample, y=FRiP, fill=bedsample))
-  p <- p + geom_bar(position="dodge",stat = "identity") +
-    facet_wrap(.~bedtool) +
-    theme_bw() +
-    theme(axis.text.x=element_text(angle = -15, hjust = 0)) +
-    labs(title=groupName, x="bam file", y ="Fraction of Reads in Peaks (FRiP)", 
-         fill ="peak file")
-  pdf(paste0(folder, "/PeakQC/", groupName,".FRiP_barplot.pdf"))
-  print(p)
-  dev.off()
-}
-
-plot_scatterplots <- function(inData, groupName, folder) {
-  p <- ggplot(inData,aes(x=n_basesM, y=FRiP, shape=bedsample, color=bedtool))
-  p <- p + geom_point(size=2.5) +
-    facet_wrap(.~bamsample) +
-    theme_bw() + 
-    scale_x_continuous(trans = "log10") +
-    labs(title=groupName, x="Number of Bases in Peaks (M)", 
-         y="Fraction of Reads in Peaks (FRiP)",
-         shape="peak file", color="peak calling tool")
-  q <- p + annotation_logticks(sides="b")
-  pdf(paste0(folder, "/PeakQC/", groupName,".FRiP_scatterplot.pdf"))
-  tryCatch(print(q), error = function(e) {print(p)})
-  dev.off()
-}
-
-plot_barplots_self <- function(inData2, folder) {
-  p <- ggplot(inData2,aes(x=bamsample, y=FRiP, fill=groupInfo))
-  p <- p + geom_bar(position="dodge",stat = "identity") +
-    facet_wrap(.~bedtool) +
-    theme_bw() +
-    theme(axis.text.x=element_text(angle = -15, hjust = 0)) +
-    labs(title="All Samples",x="bam file", y ="Fraction of Reads in Peaks (FRiP)", 
-         fill ="Group")
-  pdf(paste0(folder, "/PeakQC/FRiP_barplot.pdf"))
-  print(p)
-  dev.off()
-}
-
-plot_scatterplots_self <- function(inData2, folder) {
-  p <- ggplot(inData2,aes(x=n_basesM, y=FRiP, shape=bedtool, color=groupInfo))
-  p <- p + geom_point(size=2.5) +
-    theme_bw() + 
-    scale_x_continuous(trans = "log10") +
-    annotation_logticks(sides="b") +
-    labs(title="All samples", x="Number of Bases in Peaks (M)", 
-         y="Fraction of Reads in Peaks (FRiP)",
-         shape="peak file", color="peak calling tool")
-  pdf(paste0(folder, "/PeakQC/FRiP_scatterplot.pdf"))
-  print(p)
-  dev.off()
-}
-
-process_json <- function(injson) {
-# to get the identities of the groups and the list of samples (ChIP and input)
-# associated with it
-  json  <- fromJSON(file = injson)
-  groupsInfo <- json$project$groups
-  inputs <- as.data.frame(json$project$peaks$inputs)
-  for (i in 1:length(groupsInfo)) {
-    tmp <- unique(unlist(inputs[names(inputs) %in% groupsInfo[[i]]]))
-    if (length(tmp) > 1) {
-       groupsInfo[[i]] <- c(groupsInfo[[i]],as.character(tmp))
-    } else if (tmp != "" ) {
-       groupsInfo[[i]] <- c(groupsInfo[[i]],as.character(tmp))
-    }
-  }
-  return(groupsInfo)
-}
-
-allData <- merge_files(folder)
-groupList <- process_json(paste0(folder,"/config.json"))
-
-for (i in 1:length(groupList)) {
-  group <- groupList[[i]]
-  groupName <- names(groupList)[i]
-  inData <- allData[which((allData$bedsample %in% group) & 
-  	    	          (allData$bamsample %in% group)),]
-  plot_barplots(inData, groupName, folder)
-  plot_scatterplots(inData, groupName, folder)
-}
-
-selfData <- allData[which(allData$bedsample == allData$bamsample),]
-groupInfo <- reshape2::melt(groupList)
-names(groupInfo) <- c("bamsample","groupInfo")
-selfData2 <- merge(selfData,groupInfo)
-plot_barplots_self(selfData2, folder)
-plot_scatterplots_self(selfData2, folder)
diff --git a/workflow/scripts/atac_nrf.py b/workflow/scripts/atac_nrf.py
deleted file mode 100644
index edf21aa..0000000
--- a/workflow/scripts/atac_nrf.py
+++ /dev/null
@@ -1,22 +0,0 @@
-from __future__ import print_function
-import sys
-
-preseq_log=sys.argv[1]
-
-with open(preseq_log, 'r') as fp:
-        for line in fp:
-            if line.startswith('TOTAL READS'):
-                tot_reads = float(line.strip().split("= ")[1])
-            elif line.startswith('DISTINCT READS'):
-                distinct_reads = float(line.strip().split('= ')[1])
-            elif line.startswith('1\t'):
-                one_pair = float(line.strip().split()[1])
-            elif line.startswith('2\t'):
-                two_pair = float(line.strip().split()[1])
-
-NRF = distinct_reads/tot_reads
-PBC1 = one_pair/distinct_reads
-PBC2 = one_pair/two_pair
-
-print("%.3f\t%.3f\t%.3f"%(NRF,PBC1,PBC2))
-
diff --git a/workflow/scripts/bam_filter_by_mapq.py b/workflow/scripts/bam_filter_by_mapq.py
deleted file mode 100644
index 12037cd..0000000
--- a/workflow/scripts/bam_filter_by_mapq.py
+++ /dev/null
@@ -1,40 +0,0 @@
-import pysam,sys
-import argparse
-
-parser = argparse.ArgumentParser(description='filter PE bamfile by mapQ values')
-parser.add_argument('-i',dest='inBam',required=True,help='Input Bam File')
-parser.add_argument('-o',dest='outBam',required=True,help='Output Bam File')
-parser.add_argument('-q',dest='mapQ',type=int,required=False,help='mapQ value ... default 6',default=6)
-args = parser.parse_args()
-
-samfile = pysam.AlignmentFile(args.inBam, "rb")
-mapq=dict()
-for read in samfile.fetch():
-        if read.is_unmapped:
-                continue
-        if read.is_supplementary:
-                continue
-        if read.is_secondary:
-                continue
-        if read.is_duplicate:
-                continue
-        if read.is_proper_pair:
-                if read.mapping_quality < args.mapQ and read.query_name in mapq:
-                        del mapq[read.query_name]
-                if read.mapping_quality >= args.mapQ  and not read.query_name in mapq:
-                        mapq[read.query_name]=1
-samfile.close()
-
-samfile = pysam.AlignmentFile(args.inBam, "rb")
-pairedreads = pysam.AlignmentFile(args.outBam, "wb", template=samfile)
-for read in samfile.fetch():
-        if read.query_name in mapq:
-                if read.is_supplementary:
-                        continue
-                if read.is_secondary:
-                        continue
-                if read.is_duplicate:
-                        continue
-                pairedreads.write(read)
-samfile.close()
-pairedreads.close()
diff --git a/workflow/scripts/blocking.py b/workflow/scripts/blocking.py
new file mode 100644
index 0000000..9f3febb
--- /dev/null
+++ b/workflow/scripts/blocking.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python3
+from os.path import join
+
+
+# ~~~ Common helper functions for blocking or controls
+
+
+def test_for_block(groupdata, contrast, blocks):
+    """ keep only the contrasts whose two groups hold equally many
+    samples and share as many distinct blocks as they have samples,
+    i.e. every individual is on both sides of the contrast """
+    kept = []
+    for con in contrast:
+        first, second = con[0], con[1]
+        # look up the block of every sample on each side of the contrast
+        lhs = [blocks[name] for name in groupdata[first]]
+        rhs = [blocks[name] for name in groupdata[second]]
+        if len(lhs) == len(rhs) and len(set(lhs) & set(rhs)) == len(lhs):
+            kept.append(con)
+    return kept
+
+
+def ctrl_test(ctrl_dict, input_name, in_dir):
+    """Return [sample bigwig], plus the control bigwig if `input_name` is in `ctrl_dict`."""
+    paths = [join(in_dir, f"{input_name}.Q5DD.RPGC.bw")]
+    if input_name in ctrl_dict:
+        paths.append(join(in_dir, ctrl_dict[input_name] + ".Q5DD.RPGC.bw"))
+    return paths
\ No newline at end of file
diff --git a/workflow/scripts/cfChIP_signatures.R b/workflow/scripts/cfChIP_signatures.R
deleted file mode 100755
index 778b75a..0000000
--- a/workflow/scripts/cfChIP_signatures.R
+++ /dev/null
@@ -1,97 +0,0 @@
-####################
-#
-# Name: cfChIP_signatures.R
-# Created by: Tovah Markowitz, PhD
-# Bioinformatics (NCBR)/ Integrated Data Sciences Section (IDSS)
-# Research Technologies Branch/DIR/NIAID
-#
-# Created: August 9, 2022
-# 
-####################
-#
-# Purpose: To take the individual cfChIP signature tables, combine them, 
-#          and create the preferred output plot
-#
-# Functions: mergeSignatures and plotSignatures
-#
-# Requires: ggplot2 and ggprism (for plotting)
-#
-# Details: mergeSignatures will take a folder of signatures and combine them 
-#         into one long table. plotSignatures can directly take the the output 
-#         of mergeSignatures, but you can also load the data into R and filter
-#         to only include a subset of samples before running the function. Also,
-#         add a column called "Condition" either to the input txt file or the R
-#         object to group columns in the plot using that additional information.
-#
-# Function1: mergeSignatures(folder, outFile)
-#   folder:  [required] the path to the folder containing the individual signature
-#            files direct from the cfChIP tool
-#   outFile: [required] the name of the output txt file to save the data
-#
-# Function2: plotSignatures(inTXT, outPDF)
-#   inTXT: [required] either the name of the file from mergeSignatures or an
-#          an R object containing the data. Column names must match that of the
-#          output of mergeSignatures, but column order doesn't matter
-#   inPDF: [required] the name of the output pdf file to create
-#
-# Example usage:
-#   source("cfChIP_signatures.R")
-#   mergeSignatures("cfChIPtool/Output/H3K4me3/Signatures/", out.txt)
-#   plotSignatures(out.txt, out.pdf)
-#   plotSignatures(signatureDataFrame, out.pdf)
-# 
-####################
-
-mergeSignatures <- function(folder, outFile) {
-  files <- list.files(folder,full.names = T)
-  sigList <- lapply(files, read.csv)
-  samples <- gsub(".csv","",grep("csv",unlist(strsplit(files,"/")),value=T))
-  for (i in 1:length(samples)) {
-    sigList[[i]] <- data.frame(sigList[[i]],Sample=samples[i])
-  }
-  sigData <- do.call("rbind",sigList)
-  write.table(sigData, outFile, quote=F, sep="\t", row.names=F)
-}
-
-plotSignatures <- function(inTXT, outPDF) {
-  library(ggplot2)
-  library(ggprism)
-
-  if (mode(inTXT) == "character") { # if using a file name
-    sigData <- read.delim(inTXT)
-  } else { # if starting with an object in R
-   sigData <- as.data.frame(inTXT) 
-  }
-  sigData$NormalizedCounts[which(sigData$NormalizedCounts > 3)] <- 3
-  sigData$NormalizedCounts[which(sigData$NormalizedCounts < 0.15)] <- 0.15
-  sigData$qValue[which(sigData$qValue > 300)] <- 300
-  sigData$qValue[which(sigData$qValue < 5)] <- NA
-  names(sigData)[1] <- "cellType"
-
-  cellTypes <- data.frame(cellType=c("Neutrophils","Monocytes","Megakaryocyte",
-                                   "Erythroblast","T-Cells","B-Cells","NK",
-                                   "Vasculary","Adipose","Skin","Sk. Muscle",
-                                   "Brain","Heart","Lung","Breast","Digestive",
-                                   "Pancreas"),
-                        class=c(rep("Blood",7),rep("Global",4),rep("Other",6)) )
-
-  sigData2 <- merge(sigData,cellTypes)
-  sigData2$cellType <- factor(sigData2$cellType,levels=rev(cellTypes$cellType))
-
-  pdf(outPDF)
-  p <- ggplot(data=sigData2,aes(x=Sample,y=cellType,color=NormalizedCounts,size=qValue))
-  p <- p + geom_point() + 
-    scale_size(limits=c(5,300),breaks=c(5,50,100,150,200,250,300),
-               labels=paste0("e-",c(5,50,100,150,200,250,300))) +
-    scale_color_viridis_c(direction = -1, option="A") +
-    theme_prism() +
-    theme(axis.title.y = element_blank(), axis.title.x = element_blank(),
-          axis.text.x = element_text(angle = 45,vjust=0.9,hjust=1))
-  if (sum(names(sigData) == "Condition") == 1) {
-    p <- p + facet_grid(rows=vars(class),cols=vars(Condition),scales="free",space="free")
-  } else {
-    p <- p + facet_grid(rows=vars(class),scales="free",space="free")
-  }
-  print(p)
-  dev.off()
-}
\ No newline at end of file
diff --git a/workflow/scripts/common.py b/workflow/scripts/common.py
index 0e51470..6feba1c 100644
--- a/workflow/scripts/common.py
+++ b/workflow/scripts/common.py
@@ -1,4 +1,11 @@
-# Common helper functions shared across the entire workflow
+#!/usr/bin/env python3
+# ~~~ Common helper functions shared across the entire workflow
+
+import os
+from os.path import join
+from snakemake.io import expand
+
+
 def provided(samplelist, condition):
     """
     Determines if optional rules should run. If an empty list is provided to rule all,
@@ -201,4 +208,54 @@ def joint_option(prefix, valueslist):
     for v in valueslist:
         s += "{} {} ".format(prefix, v)
     s = s.rstrip()
-    return s
\ No newline at end of file
+    return s
+
+
+def mk_dir_if_not_exist(dirs):
+    """Create every missing directory in `dirs` (one path or a list); return True."""
+    dirs = [dirs] if isinstance(dirs, str) else dirs
+    assert isinstance(dirs, list), 'Supplied directories should be in a list'
+    for _dir in dirs:
+        if not os.path.exists(_dir):
+            os.makedirs(_dir, mode=0o775, exist_ok=True)  # parents too; race-safe
+    return True
+
+
+def get_file_components(pair_ended):
+    """Return (stems, rpgc_exts, alnexts) for the alignment files of this run.
+
+    Stems are the text before the first '.'; rpgc_exts are stems + '.RPGC'.
+    """
+    alnexts = ['sorted.bam', 'Q5DD.bam' if pair_ended else 'Q5DD_tagAlign.gz']
+    stems = [ext.split('.')[0] for ext in alnexts]
+    rpgc_exts = [stem + '.RPGC' for stem in stems]
+    return stems, rpgc_exts, alnexts
+        
+
+def get_bam_ext(ext, pair_ended):
+    """Return the file extension ('bam' or 'gz') for an alignment stem.
+
+    'sorted' matches case-insensitively; the de-duplicated stem is 'Q5DD'
+    (paired-end) or 'Q5DD_tagAlign' (single-end). Else raises ValueError.
+    """
+    if ext.lower() == 'sorted':
+        return "bam"
+    dedup_stem, dedup_ext = ('Q5DD', "bam") if pair_ended else ("Q5DD_tagAlign", "gz")
+    if ext == dedup_stem:
+        return dedup_ext
+    raise ValueError(f'Unknown file component. Pair ended: {str(pair_ended)}. Ext: {str(ext)}')
+
+
+def get_fqscreen_outputs(paired_end, samples, qc_dir):
+    """List the expected FastQ Screen outputs (txt and png) for every sample.
+
+    Paired-end runs expect files for R1 and R2, single-end runs for R1 only,
+    under both the FQscreen and FQscreen2 sub-directories of `qc_dir`.
+    """
+    reads = [1, 2] if paired_end else [1]
+    outs = []
+    for subdir in ("FQscreen", "FQscreen2"):
+        for suffix in ("txt", "png"):
+            pattern = join(qc_dir, subdir, "{name}.R{rn}.trim_screen." + suffix)
+            outs.extend(expand(pattern, name=samples, rn=reads))
+    return outs
\ No newline at end of file
diff --git a/workflow/scripts/frip.py b/workflow/scripts/frip.py
deleted file mode 100644
index 113eb62..0000000
--- a/workflow/scripts/frip.py
+++ /dev/null
@@ -1,164 +0,0 @@
-#!/usr/bin/env python3
-
-"""
-Name: frip.py
-Created by: Tovah Markowitz
-Date: 06/18/20
-
-Purpose: To calculate FRiP scores, one bam file and as many bedfiles as wanted as inputs
-Currently only works with python/3.5
-"""
-
-##########################################
-# Modules
-import argparse
-from argparse import RawTextHelpFormatter
-from pybedtools import BedTool
-import pysam
-import pandas as pd
-
-##########################################
-# Functions
-
-def split_infiles(infiles):
-    """ 
-    breaks the infile string with space-delimited file names and 
-    creates a list
-    """
-    infileList = infiles.strip("\'").strip('\"').split(" ")
-    if len(infileList) == 1:
-        infileList = infileList[0].split(";")
-    return(infileList)
-
-def count_reads_in_bed(bam, bedfile, genomefile):
-    """
-    some of this comes directly from the pybedtools site; read in 
-    bed (or bed-like) file, sort it, and then count the number of 
-    reads within the regions
-    """
-    bedinfo = BedTool(bedfile)
-    bedinfo.sort(g=genomefile)
-    return (
-        BedTool(bam).intersect( bedinfo, bed=True, stream=True, )
-        ).count()
-
-def count_reads_in_bam(bam):
-    """ count the number of reads in a given bam file """
-    return( pysam.AlignmentFile(bam).mapped )
-
-def calculate_frip(nreads, noverlaps):
-    """ calculate FRiP score from nreads and noverlaps """
-    return( float(noverlaps) / nreads )
-
-def measure_bedfile_coverage(bedfile, genomefile):
-    """ calculate the number of bases covered by a given bed file """
-    bedinfo = BedTool(bedfile)
-    return( bedinfo.sort(g=genomefile).total_coverage() )
-
-def clip_bamfile_name(bamfile):
-    """ 
-    clip bam file name for table/plotting purposes; assumes file 
-    naming system matches that of Pipeliner
-    """
-    sample = bamfile.split("/")[-1].split(".")[0]
-    condition =  ".".join(bamfile.split("/")[-1].split(".")[1:-1])
-    return( sample, condition )
-
-def clip_bedfile_name(bedfile,filetype):
-    """
-    clip bed file name for table/plotting purposes; assumes file 
-    naming system matches that of Pipeliner
-    """
-    if filetype == "":
-        toolused = bedfile.split("/")[-3]
-        sample = bedfile.split("/")[-2]
-    else:
-        toolused = filetype
-        sample = bedfile.split("/")[-1].split(".")[0].strip("_peaks").strip("_broadpeaks")
-    return( toolused, sample )
-
-def process_files(bamfile, bedfiles, genome, filetypes):
-    """ 
-    this is the main function to take in list of input files and 
-    put out an array containing key file name information, read 
-    counts, and FRiP scores
-    """
-    bedfileL = bedfiles
-    filetypesL = filetypes
-    out = [[ "bedtool", "bedsample", "bamsample", "bamcondition", 
-    "n_reads", "n_overlap_reads", "FRiP", "n_basesM" ]]
-    nreads = count_reads_in_bam(bamfile)
-    (bamsample, condition) = clip_bamfile_name(bamfile)
-    for i in range(len(bedfileL)):
-        bed = bedfileL[i]
-        if len(filetypesL) > 1:
-            filetype = filetypesL[i]
-        else:
-            filetype = filetypesL[0]
-        (bedtool, bedsample) = clip_bedfile_name(bed,filetype)
-        noverlaps = count_reads_in_bed(bamfile, bed, genome)
-        frip = calculate_frip(nreads, noverlaps)
-        nbases = measure_bedfile_coverage(bed, genome) / 1000000
-        out.append( [bedtool, bedsample, bamsample, condition, 
-                nreads, noverlaps, frip, nbases] )
-    out2 = pd.DataFrame(out[1:], columns=out[0])
-    return(out2)
-
-def create_outfile_name(bamfile, outroot):
-    """ uses outroot to create the output file name """
-    (bamsample, condition) = clip_bamfile_name(bamfile)
-    outtable = bamsample + "." + condition + "." + "FRiP_table.txt"
-    if outroot != "":
-        outtable = outroot + "." + outtable
-    return(outtable)
-
-def write_table(out2, outtable):
-    out2.to_csv(outtable,sep='\t',index=False)
-
-
-###############################################
-# Main
-
-def main():
-    desc="""
-This function takes a space-delimited or semi-colon delimited list
-of bed-like files (extensions must be recognizable by bedtools)
-and a single bam file. It will then calculate the FRiP score for
-all possible combinations of files and save the information in a
-txt file. It will also calculate the number of bases covered by 
-each bed-like file. Note: this function assumes that the file 
-naming system of the input files matches that of Pipeliner.
-    """
-
-    parser = argparse.ArgumentParser(description=desc, formatter_class=RawTextHelpFormatter)
-    parser.add_argument('-p', nargs = '+', required=True, type=str, help='A space- or semicolon-delimited list of peakfiles \
-(or bed-like files).')
-    parser.add_argument('-b', required=True, type=str, help='The name of a bamfile to analyze.')
-    parser.add_argument('-g', required=True, type=str, help='The name of the .genome file so bedtools knows the \
-size of every chromosome.')
-    parser.add_argument('-o', required=True, type=str, help='The root name of the multiple output files. Default:""')
-    parser.add_argument('-t', required=False, default=[""], type=list, help='A space- \
-or semicolon-delimited list of input file sources/types. Only needed when \
-source of bed file is not built into the script. Default: ""')
-
-    args = parser.parse_args()
-    bedfiles = args.p
-    bamfile = args.b
-    genomefile = args.g
-    outroot = args.o
-    filetypes = args.t
-
-    out2 = process_files(bamfile, bedfiles, genomefile, filetypes)
-    outtable = create_outfile_name(bamfile, outroot)
-    write_table(out2, outtable)
-
-if __name__ == '__main__':
-    main()
-
-###############################################
-# example cases
-
-#bedfiles = "macs_broad/mWT_HCF1_mm_i81/mWT_HCF1_mm_i81_peaks.broadPeak macs_broad/mWT_HCF1_mm_i89/mWT_HCF1_mm_i89_peaks.broadPeak"
-#bamfiles = "bam/Input_mm_i95.sorted.Q5DD.bam bam/mWT_HCF1_mm_i81.sorted.Q5DD.bam bam/mWT_HCF1_mm_i89.sorted.Q5DD.bam"
-#genomefile = "/data/CCBR_Pipeliner/db/PipeDB/Indices/mm10_basic/indexes/mm10.fa.sizes"
-#out2 = pd.read_csv("FRIP_test.txt",sep="\t")
diff --git a/workflow/scripts/grouping.py b/workflow/scripts/grouping.py
index 6e39613..b8214cb 100644
--- a/workflow/scripts/grouping.py
+++ b/workflow/scripts/grouping.py
@@ -1,4 +1,7 @@
 #!/usr/bin/env python3
+# ~~~ Common helper functions for grouping of outputs
+from os.path import join
+
 # common functions related to sample grouping or group meta-information
 def group_samples_by_reps(groupdata, samples, chip2input):
     groupdatawinput = {}
@@ -47,8 +50,11 @@ def group_output_files(extensions, groupslist, inputnorm):
     
     return dtoolgroups, dtoolext
 
+
 def zip_contrasts(contrast, PeakTools):
-    """making output file names for differential binding analyses"""
+    """
+        making output file names for differential binding analyses
+    """
     zipGroup1, zipGroup2, zipTool, contrasts = [], [], [], []
     for g1, g2 in contrast:
         for PeakTool in PeakTools:
@@ -56,4 +62,48 @@ def zip_contrasts(contrast, PeakTools):
             zipGroup2.append(g2)
             zipTool.append(PeakTool)
             contrasts.append( g1 + "_vs_" + g2 + "-" + PeakTool )
-    return(zipGroup1, zipGroup2, zipTool, contrasts)
\ No newline at end of file
+    return(zipGroup1, zipGroup2, zipTool, contrasts)
+
+
+def get_peaktools(assay_type):
+    """Return the peak callers for an assay; macsNarrow is always included."""
+    extra_tools = {
+        "atac": ["Genrich"],
+        "chip": ["macsBroad", "sicer"],
+    }
+    return ["macsNarrow"] + extra_tools.get(assay_type, [])
+
+
+def dedup_out7(input, assay, paired_end):
+    """Return [input + tagAlign suffix] for cfchip or single-end chip, else []."""
+    if assay == "cfchip":
+        return [input + ".Q5DD_tagAlign"]
+    if paired_end == False and assay == "chip":  # `== False` kept: None is not single-end
+        return [input + ".Q5DD_tagAlign.gz"]
+    return []
+
+
+def get_ppqt_input(ppqt_dir, wildcards, paired_end):
+    """Return a one-item list holding the ppqt txt path for `wildcards.name`.
+
+    Single-end 'Q5DD' maps to 'Q5DD_tagAlign'; unknown single-end ext raises.
+    """
+    if paired_end:
+        stem = wildcards.ext
+    elif wildcards.ext in ("Q5DD", "sorted"):
+        stem = "Q5DD_tagAlign" if wildcards.ext == "Q5DD" else "sorted"
+    else:
+        raise ValueError(f'Unknown alignment file extension, name: {wildcards.name}, ext: {wildcards.ext}.')
+    return [join(ppqt_dir, "{0}.{1}.ppqt.txt".format(wildcards.name, stem))]
+
+
+def get_bam_input(bam_dir, wildcards, paired_end):
+    """Return a one-item list with the BAM path for `wildcards.name`/`.ext`.
+
+    Single-end runs only recognise the 'Q5DD' and 'sorted' extensions and
+    return an empty list for anything else.
+    """
+    if not paired_end and wildcards.ext not in ("Q5DD", "sorted"):
+        return []
+    bam_name = "{0}.{1}.bam".format(wildcards.name, wildcards.ext)
+    return [join(bam_dir, bam_name)]
\ No newline at end of file
diff --git a/workflow/scripts/jaccard_score.py b/workflow/scripts/jaccard_score.py
deleted file mode 100644
index 378ee69..0000000
--- a/workflow/scripts/jaccard_score.py
+++ /dev/null
@@ -1,202 +0,0 @@
-#!/usr/bin/env python3
-
-"""
-Name: jaccard_score.py
-Created by: Tovah Markowitz
-Date: 1/23/19
-Updated: 8/5/19 to compare multiple tools and create plots
-
-Purpose: To do all pairwise comparisons of bed/peak files given. Uses bedtools
-to calculate a jaccard score for every comparison. All data is saved in a 
-single tab-delimited file.
-"""
-
-##########################################
-# Modules
-import optparse
-from pybedtools import BedTool
-import pandas as pd
-from sklearn.decomposition import PCA as sklearnPCA
-import matplotlib as mpl
-mpl.use('Agg')
-import matplotlib.pyplot as plt
-import seaborn as sns
-
-##########################################
-# Functions
-
-def split_infiles(infiles):
-    """ breaks the infile string with space-delimited file names and creates a list.
-    also works for infile types
-    """
-    infileList = infiles.strip("\'").strip('\"').split(" ")
-    if len(infileList) == 1:
-        infileList = infileList[0].split(";")
-    return(infileList)
-
-def loop_jaccard(infileList, genomefile, filetypeList):
-    """ Uses two loops to do all possible pairwise comparisons of files 
-    in a list. Returns a writeable output and a pandas object
-    """
-    nfiles = len(infileList)
-    (colnames, snames) = get_colnames(infileList, filetypeList)
-    out = [[1.000] * nfiles for i in range(nfiles)]
-    outTable = []
-    for z in range(nfiles):
-        fileA = infileList[z]
-        print("fileA is: " + fileA) 
-        for y in range(z+1,nfiles):
-            fileB = infileList[y]
-            (data, keylist) = run_jaccard(fileA, fileB, genomefile)
-            out[z][y] = data[3]
-            out[y][z] = data[3]
-            if filetypeList != [""]:
-                keylist.insert(1, "toolA")
-                keylist.insert(3, "toolB")
-                data.insert(1, filetypeList[z])
-                data.insert(3, filetypeList[y])
-            if len(outTable) == 0:
-                outTable.append( "\t".join(keylist) )
-            outTable.append( "\t".join(data) )
-        out2 = pd.DataFrame(out, columns=colnames, index=colnames,dtype="float")
-    return(outTable, out2, snames)
-
-def run_jaccard(fileA, fileB, genomefile):
-    """ Running bedtools. Reads in two bedtools approved file types, sorts the files, 
-    and calculates a jaccard score.
-    """
-    a = BedTool(fileA)
-    a = a.sort(g=genomefile)
-    b = BedTool(fileB)
-    b = b.sort(g=genomefile)
-    j = a.jaccard(b,g=genomefile)
-    j["fileA"] = fileA.split("/")[-1]
-    j["fileB"] = fileB.split("/")[-1]
-    keylist = list(j.keys())
-    keylist.sort()
-    data = [ str(j[key]) for key in keylist ]
-    return(data, keylist)
-
-def get_colnames(infileList, filetypeList):
-    snames = [ i.split("/")[-1].split(".")[0].strip("_peaks").strip("_broadpeaks") for i in infileList ]
-    if filetypeList == [""]:
-        colnames = snames
-    else:
-        colnames = [ snames[i] + "_" + filetypeList[i] for i in range(len(snames)) ]
-    return(colnames, snames)
-
-def create_outfile_names(outroot):
-    """ uses outroot to create the output file names """
-    outTableFile = "jaccard.txt"
-    outPCAFile = "jaccard_PCA.pdf"
-    outHeatmapFile = "jaccard_heatmap.pdf"
-    if outroot != "":
-        if outroot[-1] == "/":
-            outTableFile= outroot + outTableFile
-            outPCAFile = outroot + outPCAFile
-            outHeatmapFile = outroot + outHeatmapFile
-        else:
-            outTableFile= outroot + "_" + outTableFile
-            outPCAFile = outroot + "." + outPCAFile
-            outHeatmapFile = outroot + "." + outHeatmapFile
-    return(outTableFile, outPCAFile, outHeatmapFile)
-
-def pca_plot(out, filetypeList, snames, outPCAFile):
-    """ creates a 2D PCA plot comparing the files based upon jaccard scores
-    """
-    sklearn_pca = sklearnPCA(n_components=2)
-    Y_sklearn = sklearn_pca.fit_transform(out)
-    PCAdata = pd.DataFrame(Y_sklearn,columns=["PC1","PC2"])
-    PCAdata.insert(0,"sample name",snames)
-    fig, ax =plt.subplots()
-    snames_pal = sns.hls_palette(len(set(snames)),s=.8)
-    sns.set_palette(snames_pal)
-    if filetypeList != [""]:
-        PCAdata.insert(1,"tool",filetypeList)
-        ax = sns.scatterplot(x="PC1",y="PC2",hue="sample name",style="tool",data=PCAdata,s=100)
-    else:
-        ax = sns.scatterplot(x="PC1",y="PC2",hue="sample name",data=PCAdata,s=100)
-    ax.axhline(y=0, color='grey', linewidth=1,linestyle="--")
-    ax.axvline(x=0, color='grey', linewidth=1,linestyle="--")
-    ax.set(xlabel= "PC1 (" + str(round(100*sklearn_pca.explained_variance_[0],2)) + "%)",
-           ylabel= "PC2 (" + str(round(100*sklearn_pca.explained_variance_[1],2)) + "%)")
-    plt.legend(bbox_to_anchor=(1.05, 1), loc=2)
-    #plt.show()
-    plt.savefig(outPCAFile, bbox_inches='tight')
-    plt.close("all")
-
-def plot_heatmap(out, outHeatmapFile, snames, filetypeList):
-    snames_pal = sns.hls_palette(len(set(snames)),s=.8)
-    snames_lut = dict(zip(set(snames), snames_pal))
-    snames_cols = pd.Series(snames,index=out.index).map(snames_lut)
-    if filetypeList != [""]:
-       tool_pal = sns.cubehelix_palette(len(set(filetypeList)))
-       tool_lut = dict(zip(set(filetypeList), tool_pal))
-       tool_cols = pd.Series(filetypeList,index=out.index).map(tool_lut)
-       g = sns.clustermap(out,cmap="YlGnBu",col_cluster=False,
-                    row_colors=[snames_cols,tool_cols])
-       for label in set(snames):
-            g.ax_col_dendrogram.bar(0, 0, color=snames_lut[label],
-                            label=label, linewidth=0)
-       for label in set(filetypeList):
-            g.ax_col_dendrogram.bar(0, 0, color=tool_lut[label],
-                            label=label, linewidth=0)
-       g.ax_col_dendrogram.legend(loc="center", ncol=3, 
-                                bbox_to_anchor=(0.4, 0.8))
-    else:
-       g = sns.clustermap(out,cmap="YlGnBu",col_cluster=False,
-                    row_colors=snames_cols)
-       for label in set(snames):
-            g.ax_col_dendrogram.bar(0, 0, color=snames_lut[label],
-                            label=label, linewidth=0)
-       g.ax_col_dendrogram.legend(loc="center", ncol=3, 
-                                bbox_to_anchor=(0.5, 0.8))
-    #plt.show()
-    plt.savefig(outHeatmapFile, bbox_inches='tight')
-    plt.close("all")
-
-def write_out(out, outFile):
-    f = open(outFile, 'w')
-    f.write( "\n".join(out) )
-    f.close()
-
-##########################################
-# Main
-
-def main():
-    desc="""
-    This function takes a space-delimited list of files (bed, bedgraph, gff, gtf, etc.)
-    and calculates all possible pairwise jaccard scores. From bedtools: 'Jaccard is the 
-    length of the intersection over the union. Values range from 0 (no intersection) to 
-    1 (self intersection)'. The columns of the output file are: fileA, fileB, 
-    intersection, jaccard, n_intersections, and union-intersection.
-    """
-
-    parser = optparse.OptionParser(description=desc)
-
-    parser.add_option('-i', dest='infiles', default='', help='A space- or semicolon-delimited list of \
-input files for analysis.')
-    parser.add_option('-t', dest='filetypes', default='', help='A space- or semicolon-delimited list \
-of input file sources/types.')
-    parser.add_option('-o', dest='outroot', default='', help='The root name of the output files \
-where all the jaccard score information will be saved.')
-    parser.add_option('-g', dest='genomefile', default='', help='The name of the .genome file.')
-
-    (options,args) = parser.parse_args()
-    infiles = options.infiles
-    filetypes = options.filetypes
-    outroot = options.outroot
-    genomefile = options.genomefile
-
-    infileList = split_infiles(infiles)
-    filetypeList = split_infiles(filetypes)
-    (outTable, out, snames) = loop_jaccard(infileList, genomefile, filetypeList)
-    (outTableFile, outPCAFile, outHeatmapFile) = create_outfile_names(outroot)
-    write_out(outTable, outTableFile)
-    pca_plot(out, filetypeList, snames, outPCAFile)
-    plot_heatmap(out, outHeatmapFile, snames, filetypeList)
-
-if __name__ == '__main__':
-    main()
-
-
diff --git a/workflow/scripts/peakcall.py b/workflow/scripts/peakcall.py
new file mode 100644
index 0000000..e3b07e1
--- /dev/null
+++ b/workflow/scripts/peakcall.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+from os.path import join
+
+def get_input_bam(input_sample, bam_dir):
+    """
+    Resolve the Q5DD BAM of a ChIP sample's paired input (control).
+
+    Returns the path as a string when `input_sample` is truthy, or an
+    empty list when the sample has no input (ChIP-only mode).
+    """
+    if not input_sample:
+        return []
+    return join(bam_dir, "{0}.Q5DD.bam".format(input_sample))
+
+
+def get_control_input(ext, paired_end, bam_dir):
+    """Return [control file path] for control name `ext`, or [] when `ext` is ''.
+
+    Paired-end controls are Q5DD BAMs; single-end ones are Q5DD tagAlign files.
+    """
+    suffix = "Q5DD.bam" if paired_end else "Q5DD_tagAlign.gz"
+    return [join(bam_dir, "{0}.{1}".format(ext, suffix))] if ext != "" else []
+
+
+def outputIDR(groupswreps, groupdata, chip2input, tools):
+    """
+    Build the parallel lists describing every IDR comparison to run.
+
+    Within each group, every pair of replicates is compared provided both
+    have an input (control) or both lack one. Pairs without inputs skip
+    the "sicer" tool. Returns (groups, sample1s, sample2s, peaktools),
+    with one entry per (pair, tool) combination.
+    """
+    IDRgroup, IDRsample1, IDRsample2, IDRpeaktool = [], [], [], []
+    for group in groupswreps:
+        replicates = groupdata[group]
+        for pos, first in enumerate(replicates):
+            has_ctrl = chip2input[first] != ""
+            for second in replicates[pos + 1:]:
+                if (chip2input[second] != "") != has_ctrl:
+                    continue
+                usable = tools if has_ctrl else [t for t in tools if t != "sicer"]
+                for tool in usable:
+                    IDRgroup.append(group)
+                    IDRsample1.append(first)
+                    IDRsample2.append(second)
+                    IDRpeaktool.append(tool)
+    return( IDRgroup, IDRsample1, IDRsample2, IDRpeaktool )
+
+
+def zip_peak_files(chips, PeakTools, PeakExtensions):
+    """
+    Pair every chip with every peak tool for FRiP inputs.
+
+    Returns three parallel lists: samples, tools and each tool's extension.
+    """
+    combos = [(chip, tool, PeakExtensions[tool]) for chip in chips for tool in PeakTools]
+    samples, tool_names, exts = ([row[k] for row in combos] for k in range(3))
+    return (samples, tool_names, exts)
+
+
+def calc_effective_genome_fraction(effectivesize, genomefile):
+    """
+    Return, as a string, effectivesize divided by the genome length summed
+    from `genomefile` (tab-separated: chrom, length). Chromosomes with "_"
+    in their name and chrX, chrY, chrM are left out of the sum.
+    """
+    genomelen = 0
+    with open(genomefile) as handle:  # close the file instead of leaking it
+        for chrom, length in (ln.strip().split("\t")[:2] for ln in handle if ln.strip()):
+            if "_" not in chrom and chrom not in ("chrX", "chrM", "chrY"):
+                genomelen += int(length)
+    return(str(float(effectivesize)/ genomelen))
+
+
+def getMacChip(bam_dir, name, paired_end):
+    """
+    Return the sample's Q5DD BAM (paired-end) or Q5DD tagAlign (single-end) path.
+    """
+    suffix = ".Q5DD.bam" if paired_end else ".Q5DD_tagAlign.gz"
+    return join(bam_dir, name + suffix)
+
+
+def getMacTXT(ppqt_dir, name, paired_end):
+    """
+    Return the sample's ppqt txt path: Q5DD (paired-end) or Q5DD_tagAlign (single-end).
+    """
+    stem = ".Q5DD" if paired_end else ".Q5DD_tagAlign"
+    return join(ppqt_dir, name + stem + ".ppqt.txt")
+
+
+def getSicerChips(bam_dir, name, paired_end):
+    """
+    Return the sample's Q5DD BAM path when paired-end, else its Q5DD tagAlign path.
+    """
+    filename = name + (".Q5DD.bam" if paired_end else ".Q5DD_tagAlign.gz")
+    return join(bam_dir, filename)
+
+
+def getSicerFragLen(ppqt_dir, qc_dir, name, paired_end):
+    """Return the insert-size metrics path (paired-end) or ppqt txt path (single-end)."""
+    if paired_end:
+        return join(qc_dir, name + ".Q5DD.insert_size_metrics.txt")
+    # single-end: use the Q5DD_tagAlign ppqt txt found in ppqt_dir instead
+    return join(ppqt_dir, name + ".Q5DD_tagAlign.ppqt.txt")
\ No newline at end of file
diff --git a/workflow/scripts/ppqt_process.py b/workflow/scripts/ppqt_process.py
deleted file mode 100644
index 9a2c9b1..0000000
--- a/workflow/scripts/ppqt_process.py
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/usr/bin/env python3
-
-#Purpose: To grab the estimated fragment length from the ppqt output and a small txt with that information. For input files, adding an extra value of 200bp as an alternative.
-import argparse
-parser = argparse.ArgumentParser(description='Script to extract the the estimated fragment length from the ppqt output.')
-parser.add_argument('-i', required=True,help='Name of the ppqt txt file')
-parser.add_argument('-o', required=True,help='Name of the output file')
-args = parser.parse_args()
-
-output = args.o
-inppqt = args.i
-
-o=open(output,'w')
-
-file = list(map(lambda z:z.strip().split(),open(inppqt,'r').readlines()))
-
-
-ppqt_values = file[0][2].split(",")
-extenders = []
-for ppqt_value in ppqt_values:
-    if int(ppqt_value) > 150:
-        extenders.append(ppqt_value)
-if len(extenders) > 0:
-    o.write(extenders[0])
-else:
-    o.write("200")      
-o.close()
\ No newline at end of file
diff --git a/workflow/scripts/prep_diffbind.py b/workflow/scripts/prep_diffbind.py
deleted file mode 100644
index 4ba7d64..0000000
--- a/workflow/scripts/prep_diffbind.py
+++ /dev/null
@@ -1,54 +0,0 @@
-#!/usr/bin/env python3
-
-import json
-import argparse
-
-parser = argparse.ArgumentParser(description='Script to prepare the DiffBind input csv')
-parser.add_argument('--g1',dest='group1',required=True,help='Name of the first group')
-parser.add_argument('--g2',dest='group2',required=True,help='Name of the second group')
-parser.add_argument('--wp',dest='workpath',required=True,help='Full path of the working directory')
-parser.add_argument('--pt',dest='peaktool',required=True,help='Name of the the peak calling tool, also the directory where the peak file will be located')
-parser.add_argument('--pe',dest='peakextension',required=True,help='The file extension of the peakcall output')
-parser.add_argument('--pc',dest='peakcaller',required=True,help='Value for the PeakCaller column of the DiffBind csv')
-parser.add_argument('--bd',dest='bamdir',required=True,help='Name of the directory where the bam files are located')
-parser.add_argument('--csv',dest='csvfile',required=True,help='Name of the output csv file')
-
-args = parser.parse_args()
-
-with open("config.json","r") as read_file:
-   config=json.load(read_file)
-   
-chip2input = config['project']['peaks']['inputs']
-groupdata = config['project']['groups']
-blocks = config['project']['blocks']
-
-if None in list(blocks.values()):
-    samplesheet = [",".join(["SampleID","Condition", "Replicate", "bamReads", 
-         "ControlID", "bamControl", "Peaks", "PeakCaller"])]
-else:
-    samplesheet = [",".join(["SampleID","Condition","Treatment","Replicate", "bamReads", 
-         "ControlID", "bamControl", "Peaks", "PeakCaller"])]
-   
-
-for condition in args.group1, args.group2:
-    for chip in groupdata[condition]:
-        replicate = str([ i + 1 for i in range(len(groupdata[condition])) if groupdata[condition][i]== chip ][0])
-        bamReads = args.workpath + "/" + args.bamdir + "/" + chip + ".Q5DD.bam"
-        controlID = chip2input[chip]
-        if controlID != "":
-            bamControl = args.workpath + "/" + args.bamdir + "/" +  controlID + ".Q5DD.bam"
-        else:
-            bamControl = ""
-        peaks = args.workpath + "/" + args.peaktool + "/" + chip + "/" + chip + args.peakextension
-        if None in list(blocks.values()):
-            samplesheet.append(",".join([chip, condition, replicate, bamReads, 
-                   controlID, bamControl, peaks, args.peakcaller]))
-        else:
-            block = blocks[chip]
-            samplesheet.append(",".join([chip, condition, block, replicate, bamReads, 
-                   controlID, bamControl, peaks, args.peakcaller]))
-            
-
-f = open(args.csvfile, 'w')
-f.write ("\n".join(samplesheet))
-f.close()
diff --git a/workflow/scripts/prep_diffbindQC.py b/workflow/scripts/prep_diffbindQC.py
deleted file mode 100644
index 550b5f9..0000000
--- a/workflow/scripts/prep_diffbindQC.py
+++ /dev/null
@@ -1,51 +0,0 @@
-#!/usr/bin/env python3
-
-import json
-import argparse
-
-parser = argparse.ArgumentParser(description='Script to prepare the DiffBind input csv')
-parser.add_argument('--wp',dest='workpath',required=True,help='Full path of the working directory')
-parser.add_argument('--pt',dest='peaktool',required=True,help='Name of the the peak calling tool, also the directory where the peak file will be located')
-parser.add_argument('--pe',dest='peakextension',required=True,help='The file extension of the peakcall output')
-parser.add_argument('--pc',dest='peakcaller',required=True,help='Value for the PeakCaller column of the DiffBind csv')
-parser.add_argument('--bd',dest='bamdir',required=True,help='Name of the directory where the bam files are located')
-parser.add_argument('--csv',dest='csvfile',required=True,help='Name of the output csv file')
-
-args = parser.parse_args()
-
-with open("config.json","r") as read_file:
-   config=json.load(read_file)
-   
-chip2input = config['project']['peaks']['inputs']
-groupdata = config['project']['groups']
-
-tmpIDs = [x for xs in groupdata.values() for x in xs]
-Ncounts = [tmpIDs.count(tmp) for tmp in set(tmpIDs)]
-
-samplesheet = [",".join(["SampleID","Condition", "Replicate", "bamReads", 
-         "ControlID", "bamControl", "Peaks", "PeakCaller"])]
-
-count = 1
-for chip in chip2input.keys():
-   if set(Ncounts) == {1}: # if all samples only in one group
-      for key in groupdata.keys():
-          if chip in groupdata[key]:
-             condition = key
-      replicate = str([ i + 1 for i in range(len(groupdata[condition])) if groupdata[condition][i]== chip ][0])
-   else:
-      condition = ""
-      replicate = str(count)
-      count = count +1
-   bamReads = args.workpath + "/" + args.bamdir + "/" + chip + ".Q5DD.bam"
-   controlID = chip2input[chip]
-   if controlID != "":
-      bamControl = args.workpath + "/" + args.bamdir + "/" +  controlID + ".Q5DD.bam"
-   else:
-      bamControl = ""
-   peaks = args.workpath + "/" + args.peaktool + "/" + chip + "/" + chip + args.peakextension
-   samplesheet.append(",".join([chip, condition, replicate, bamReads, 
-                   controlID, bamControl, peaks, args.peakcaller]))            
-
-f = open(args.csvfile, 'w')
-f.write ("\n".join(samplesheet))
-f.close()
diff --git a/workflow/scripts/promoterAnnotation_by_Gene.R b/workflow/scripts/promoterAnnotation_by_Gene.R
deleted file mode 100755
index 846cfc0..0000000
--- a/workflow/scripts/promoterAnnotation_by_Gene.R
+++ /dev/null
@@ -1,179 +0,0 @@
-####################
-#
-# Name: promoterAnnotationByGene.R
-# Created by: Tovah Markowitz, PhD
-# Bioinformatics (NCBR)/ Integrated Data Sciences Section (IDSS)
-# Research Technologies Branch/DIR/NIAID
-#
-# Created: August 8, 2022
-# Updated: October 26, 2022 to work with uropa 4.0.2
-# Updated: November 3, 2022 to fit with pipeline
-# 
-####################
-#
-# Purpose: To take UROPA allhits output files using "TSSprot" conditions and
-#          create a table of which genes have annotations overlapping their 
-#          promoters and how many times. Output format: dataframe
-#
-# Details: Promoters will be defined as 3kb upstream to 1 kb downstream of the 
-#          TSS. Allhits files were chosen to capture information from "peaks" 
-#          overlappingmultiple promoters. Finalhits files can also be processed 
-#          with this pipeline. This script can handle multiple allhits files as 
-#          long as there are equal numbers of sampleNames to go with them. Also, 
-#          giving a matching DiffBind txt file will allow the allhits file to be 
-#          filtered to only include the significant differential peaks or to 
-#          split the data by the direction of log fold-change.
-#
-# Requires: GenomicRanges, tidyr
-#
-# Function: promoterAnnotationByGene(allhitsFiles, sampleNames, diffbindFiles=NA, direction=NA)
-# 
-# Variables:
-#     allhitsFiles:  [required] a vector of allhits files to process
-#     sampleNames:   [required] a vector of short names for each allhits file 
-#                               to use as column headers
-#     diffbindFiles: [optional] a vector of diffbind files to use to filter each 
-#                               allhits file
-#     direction:     [optional] when filtering using diffbindFiles, define how to
-#                               filter using log fold change. "Both" is default
-#                               when not defined by user.
-#                               Options: "both", "pos", "neg", "separate"
-#
-# Example usage:
-#     source("promoterAnnotation_by_Gene.R")
-#     out1 <- promoterAnnotationByGene(allhitsA.txt, "A")
-#     out2 <- promoterAnnotationByGene(allhitsA.txt, "A", diffbindA.txt, "both")
-#     out3 <- promoterAnnotationByGene(allhitsFiles= c(allhitsA.txt, allhitsB.txt), 
-#                                      sampleNames=c("A","B"), 
-#                                      diffbindFiles=c(diffbindA.txt,diffbindB.txt), 
-#                                      direction="pos")
-#     out4 <- promoterAnnotationByGene(allhitsFiles= c(allhitsA.txt, allhitsA.txt), 
-#                                      sampleNames=c("Deseq2","EdgeR"), 
-#                                      diffbindFiles=c(Deseq2.txt,EdgeR.txt), 
-#                                      direction="separate")
-# 
-####################
-
-
-allhits2promoter <- function(allhitsFile) {
-  # cleaning up the allhits file to only keep information about peaks 
-  # overlapping promoters
-  inData <- read.delim(allhitsFile)
-  tmp <- which(inData$name == "query_1")
-  if (length(tmp) == 0) {
-    print (paste0("Supplied file ", allhitsFile, " has no peaks overlapping promoters."))
-  } else {
-    promoterData <- inData[tmp,]
-    promoterData <- promoterData[,c("peak_chr", "peak_start", "peak_end", "gene_id", "gene_name")]
-    return(promoterData)
-  }
-}
-
-filterPromoter <- function(Diffbind, promoterData, sampleName) {
-  # used by DiffbindFilterPromoter
-  promoterData2 <- GenomicRanges::makeGRangesFromDataFrame(promoterData, seqnames.field="peak_chr",
-                                  start.field="peak_start", end.field="peak_end",
-                                  starts.in.df.are.0based=F)
-  Diffbind2 <- GenomicRanges::makeGRangesFromDataFrame(Diffbind)
-  ov <- GenomicRanges::countOverlaps(promoterData2,Diffbind2,type = "equal",maxgap=1)
-  promoterData3 <- promoterData[which(ov != 0),]
-  promoterData3$sample_id <- sampleName
-  return(promoterData3)
-}
-
-DiffbindFilterPromoter <- function(DiffbindFile, promoterData, sampleName, direction) {
-  # filters the promoter data based upon whether it matches a different peak and what direction the fold-change is
-  # direction can be: "both", "pos", "neg", "separate". If direction is NA, use "both".
-    Diffbind <- read.delim(DiffbindFile)
-    Diffbind <- Diffbind[which(Diffbind$FDR < 0.05),]
-    if ((direction == "both") | is.na(direction)) {
-      promoterData2 <- filterPromoter(Diffbind, promoterData, sampleName)
-    } else if (direction == "pos") {
-      sampleName <- paste0(sampleName, "_pos")
-      Diffbind <- Diffbind[which(Diffbind$Fold > 0),]
-      promoterData2 <- filterPromoter(Diffbind, promoterData, sampleName)
-    } else if (direction == "neg") {
-      sampleName <- paste0(sampleName, "_neg")
-      Diffbind <- Diffbind[which(Diffbind$Fold < 0),]
-      promoterData2 <- filterPromoter(Diffbind, promoterData, sampleName)
-    } else {
-      sampleNameP <- paste0(sampleName, "_pos")
-      DiffbindP <- Diffbind[which(Diffbind$Fold > 0),]
-      promoterDataP <- filterPromoter(DiffbindP, promoterData, sampleNameP)
-      sampleNameN <- paste0(sampleName, "_neg")
-      DiffbindN <- Diffbind[which(Diffbind$Fold < 0),]
-      promoterDataN <- filterPromoter(DiffbindN, promoterData, sampleNameN)
-      promoterData2 <- rbind(promoterDataP, promoterDataN)
-    }
-return(promoterData2)
-}
-
-createPromoterTable <- function(promoterData) {
-  # making final output table
-  PromoterTable <- data.frame( table(promoterData[,c("gene_id", "sample_id")] ) )
-  PromoterTable2 <- merge( unique(promoterData[,c("gene_id", "gene_name")] ), PromoterTable)
-  PromoterTable3 <- tidyr::pivot_wider(PromoterTable2, names_from="sample_id", values_from="Freq")
-  return(PromoterTable3)
-}
-
-promoterAnnotationByGene <- function(allhitsFiles, sampleNames, diffbindFiles=NA, direction=NA) {
-  # the main function
-  if ( length(allhitsFiles) != length(sampleNames) ) {
-    print("Number of allhits files and sample names don't match.")
-  } else {
-    if ( (length(allhitsFiles) != length(diffbindFiles)) & (sum(is.na(diffbindFiles)) != 1) ) {
-      print("Number of allhits files and diffbind files don't match.")
-    } else {
-      if ( length(allhitsFiles) == 1 ) {
-        promoterData <- allhits2promoter(allhitsFiles)
-        if (is.na(diffbindFiles)) {
-          promoterData$sample_id <- sampleNames
-        } else {
-          promoterData <- DiffbindFilterPromoter(diffbindFiles, promoterData, sampleNames, direction)
-        } 
-      } else {
-        for ( a in 1:length(allhitsFiles) ) {
-          print(a)
-          tmpA <- allhits2promoter(allhitsFiles[a])
-          if (sum(is.na(diffbindFiles)) ==1) {
-            tmpA$sample_id <- sampleNames[a]
-          } else {
-            tmpA <- DiffbindFilterPromoter(diffbindFiles[a], tmpA, sampleNames[a], direction)
-          }
-          if (a == 1) {
-            promoterData <- tmpA
-          } else {
-            promoterData <- rbind(promoterData, tmpA)
-          }
-        }
-      }
-    }
-    promoterTable <- createPromoterTable(promoterData)
-    return(promoterTable)
-  }   
-}     
-
-peakcallVersion <- function(inFolder,outFile) {
-# currently only works for macs outputs
-# inFolder here is the folder where the uropa output files are located
-  filesA <- list.files(path=inFolder,pattern="allhits.txt")
-  samples <- matrix(unlist(strsplit(filesA,"_macs")),ncol=2,byrow=T)[,1]
-  filesA <- list.files(path=inFolder,pattern="allhits.txt",full.names = T)
-  promoterInfo <- promoterAnnotationByGene(allhitsFiles=filesA, sampleNames=samples)
-  write.table(promoterInfo, outFile, quote=F,sep="\t",row.names=F)
-}
-
-diffbindVersion <- function(inFolder,outFile) {
-# currently designed for macs peaks, analyzed by deseq2
-# analyzing both positive and negative together for now
-# inFolder here is the root working directory for the project
-  uropaFolder <- paste0(inFolder, "/UROPA_annotations/DiffBind")
-  diffbindFolder <- paste0(inFolder, "/DiffBind")
-  filesU <- list.files(path=uropaFolder, pattern="DiffbindDeseq2_uropa_protTSS_allhits.txt")
-  samples <- matrix(unlist(strsplit(filesU,"-macs")),ncol=2,byrow=T)[,1]
-  filesU <- list.files(path=uropaFolder, pattern="DiffbindDeseq2_uropa_protTSS_allhits.txt",full.names=T)
-  filesD <- list.files(path=diffbindFolder, pattern="Deseq2.txt",full.names=T,recursive=T)
-  promoterInfo <- promoterAnnotationByGene(allhitsFiles=filesU, 
-                     sampleNames=samples, diffbindFiles=filesD, direction="both")
-  write.table(promoterInfo, outFile, quote=F,sep="\t",row.names=F)
-}
diff --git a/workflow/scripts/significantPathways.R b/workflow/scripts/significantPathways.R
deleted file mode 100755
index dd9af36..0000000
--- a/workflow/scripts/significantPathways.R
+++ /dev/null
@@ -1,127 +0,0 @@
-####################
-#
-# Name: significantPathways.R
-# Created by: Tovah Markowitz, PhD
-# Bioinformatics (NCBR)/ Integrated Data Sciences Section (IDSS)
-# Research Technologies Branch/DIR/NIAID
-#
-# Created: August 9, 2022
-# Updated: October 28, 2022 to make reactomePA optional
-# Updated: November 4, 2022 to accept a txt file or a gtf for the background genes
-#                           also to accept a promoter annotation table and analyze every column
-# 
-####################
-#
-# Purpose: To take a list of genes and find the significant KEGG or Reactome
-#          pathways using overenrichment analysis. See details for specialized functionality.
-#
-# Requires: clusterProfiler, ReactomePA, enrichplot, org.Hs.eg.db, rtracklayer, ggplot2, and ggprism
-#
-# Details: Takes input gene lists as Ensembl gene IDs or gene symbols, converts to
-#          Entrez gene IDs, and runs ORA against KEGG or Reactome database. Requires
-#          a background gene list as cfChIP currently ignores chrs X, Y, and M.
-#          Outputs a dataframe of significant pathways, a pdf of the top most
-#          significant pathways, or a pdf of just the pathways of interest (if significant).
-#
-# Function: significantPathways(Genes, bkgGeneTXT, database="KEGG", PDFfile=NA, pathwayVector=NA)
-#
-# Variables:
-#   Genes:         [Required] a vector of the genes to be analyzed through ORA
-#   bkgGeneFILE:   [Required] a txt file containing a column of Ensembl IDs listing 
-#                  the appropriate background gene set or the gtf file used for the uropa
-#                  annotations 
-#                  For example: hg19.ensembl.prot_coding.with_annotations.txt
-#   database:      [Optional] whether to compare to the KEGG or Reactome database
-#                  default: KEGG
-#   PDFfile:       [Optional] name of the PDF file to create, if empty no PDF will be made
-#   pathwayVector: [Optional] a vector of pathways (descriptions or IDs) to plot in the pdf.
-#                  If PDFfile is empty, it is ignored. If this is empty and PDFfile is not,
-#                  pdf plot will be of the top 30 most significant pathways instead.
-#
-# Example usage:
-#   source("significantPathways.R")
-#   out <- significantPathways(Genes= c("GeneA","GeneB"), 
-#                              bkgGeneFILE= "hg19.ensembl.prot_coding.with_annotations.txt",
-#                              database="KEGG", PDFfile="a.pdf", 
-#                              pathwayVector=c("pathwayA", "pathwayB"))
-# 
-####################
-
-library(clusterProfiler)
-library(enrichplot)
-library(ggplot2)
-library(ggprism)
-
-makeBarplotTop <- function(inData, titleName, PDFfile) {
-  inDataCount <- sum(inData@result$p.adjust < 0.1)
-  if (inDataCount > 30) { inDataCount = 30 } 
-  if (inDataCount > 0) {
-    pdf(PDFfile)
-    print(barplot(inData, showCategory = inDataCount,
-                  label_format=70, title=titleName, x="GeneRatio") + 
-            theme_prism(base_size =8) + theme(legend.title = element_text()) )
-  }
-  dev.off()
-}
-
-makeBarPlotSelect <- function(inData, titleName, PDFfile, categories) {
-  pdf(PDFfile)
-  print(barplot(inData, showCategory = categories,
-                label_format=70, title=titleName, x="GeneRatio") + 
-          theme_prism(base_size = 8) + theme(legend.title = element_text()) )
-  dev.off()
-  }
-
-processGenes <- function(geneIDs) {
-  if (grepl("^ENSG", geneIDs[1])) {
-    ensIDs <- gsub("\\.[0-9]+", "", geneIDs, perl=T)
-    entrezIDs <- bitr(ensIDs, from= "ENSEMBL", toType="ENTREZID", OrgDb="org.Hs.eg.db")
-  } else {
-    entrezIDs <- bitr(ensIDs, from= "SYMBOL", toType="ENTREZID", OrgDb="org.Hs.eg.db")
-  }
-  entrezIDs <- entrezIDs$ENTREZID
-  return(entrezIDs)
-}
-
-significantPathways <- function(Genes, bkgGeneFILE, database="KEGG", PDFfile=NA, pathwayVector=NA) {
-  sigGenes <- processGenes(Genes)
-  if (grepl("gtf",bkgGeneFILE)) {
-    bkgGenesData <- rtracklayer::import(bkgGeneFILE)
-    bkgGenes <- unique(bkgGenesData$gene_id)
-  } else {
-    bkgGenesData <- read.delim(bkgGeneFILE)
-    bkgGenes <- bkgGenesData[,grep("^ENSG", bkgGenesData[1,])]
-  }
-  backgroundGenes <- processGenes(bkgGenes)
-  if (database == "KEGG") {
-    pathwaySig <- enrichKEGG(sigGenes, organism= "hsa", keyType="kegg", universe=backgroundGenes, use_internal_data=TRUE)
-    pathwayData <- pathwaySig@result[which(pathwaySig@result$p.adjust < 0.1),]
-  } else {
-    library(ReactomePA)
-    pathwaySig <- enrichPathway(sigGenes, readable=T, universe=backgroundGenes)
-    pathwayData <- pathwaySig@result[which(pathwaySig@result$p.adjust < 0.1),]
-  }
-  if (!is.na(PDFfile)) {
-    if(length(pathwayVector) != 1) {
-      if (length(grep("HSA", pathwayVector, ignore.case=T)) != 0) {
-        pathwayVector <- pathwayData$Description[which(pathwayData$ID %in% pathwayVector)]
-      }
-      makeBarPlotSelect(inData=pathwaySig, titleName=database, PDFfile=PDFfile, categories=pathwayVector)
-    } else {
-      makeBarplotTop(inData=pathwaySig, titleName=database, PDFfile=PDFfile)
-    }
-  }
-  return(pathwayData)
-}
-
-promoterAnnotationWrapper <- function(promoterFile, bkgGeneFILE, database="KEGG") {
-   promoterData <- read.delim(promoterFile)
-   outFolder <- dirname(promoterFile)
-   for (i in 3:ncol(promoterData)) {
-      colName <- names(promoterData)[i]
-      Genes <- promoterData$gene_id[which(promoterData[,i] > 0)]
-      outData <- significantPathways(Genes, bkgGeneFILE, database)
-      outFileName <- paste0(outFolder,"/",colName,"_",database,".txt")
-      write.table(outData, outFileName, quote=F, row.names=F, sep="\t")
-   }
-}

From 55ca7198c46ac641670d791bf8656fe362e8c151 Mon Sep 17 00:00:00 2001
From: Ryan Routsong <routsongrm@nih.gov>
Date: Mon, 8 Jul 2024 18:24:35 -0400
Subject: [PATCH 03/28] fix: correct diffbindedger output paths

---
 src/run.py                 |   6 +-
 workflow/Snakefile         |  90 ++++++-------
 workflow/rules/dba.smk     | 266 ++++++++++++++++++++-----------------
 workflow/scripts/common.py |  12 +-
 4 files changed, 202 insertions(+), 172 deletions(-)

diff --git a/src/run.py b/src/run.py
index 34fc423..16be94e 100644
--- a/src/run.py
+++ b/src/run.py
@@ -19,7 +19,7 @@
 from . import version as __version__
 
 
-def init(repo_path, output_path, links=[], required=['workflow', 'resources', 'config']):
+def init(repo_path, output_path, links=[], required=['workflow', 'bin', 'resources', 'config']):
     """Initialize the output directory. If user provides a output
     directory path that already exists on the filesystem as a file 
     (small chance of happening but possible), a OSError is raised. If the
@@ -207,7 +207,7 @@ def setup(sub_args, ifiles, repo_path, output_path):
     # Add other runtime info for debugging
     config['project']['version'] = __version__
     config['project']['workpath'] = os.path.abspath(sub_args.output)
-    config['project']['binpath'] = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'bin'))
+    config['project']['binpath'] = os.path.abspath(os.path.join(config['project']['workpath'], '..', 'bin'))
     git_hash = git_commit_hash(repo_path)
     config['project']['git_commit_hash'] = git_hash   # Add latest git commit hash
     config['project']['pipeline_path'] = repo_path    # Add path to installation
@@ -610,6 +610,8 @@ def dryrun(outdir, config='config.json', snakefile=os.path.join('workflow', 'Sna
         dryrun_output = subprocess.check_output([
             'snakemake', '-npr',
             '-s', str(snakefile),
+            '--verbose',
+            '--debug-dag',
             '--use-singularity',
             '--rerun-incomplete',
             '--cores', str(256),
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 026c56d..6d1bce7 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -85,62 +85,60 @@ if assay == "cfchip":
         join(uropa_dir, "promoterTable1", "{PeakTool}_promoter_overlap_summaryTable.txt"), 
         PeakTool=PeakTools
     ))
-    # rule_all_ins.extend(expand(
-    #     join(diffbind_dir, "{name}_{PeakTool}_uropa_{_type}_allhits.txt"),
-    #     PeakTool=['DiffbindEdgeR','DiffbindDeseq2'], 
-    #     name=contrasts, 
-    #     _type=peak_types
-    # ))
+
+
     if reps:
         rule_all_ins.extend(expand(
             join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), 
             group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC
         ))
         rule_all_ins.extend(expand(
-            join(diffbind_dir, "{name}_{PeakTool}_uropa_{type}_allhits.txt"), PeakTool=['DiffbindEdgeR','DiffbindDeseq2'], 
-            name=contrasts, _type=peak_types
+            join(uropa_dir, "{name}_{PeakTool}_uropa_{type}_allhits.txt"), PeakTool=['DiffbindEdgeR','DiffbindDeseq2'], 
+            name=contrasts, _type=["protTSS"]
         ))
-elif assay in ["atac", "chip"]:
-    peak_types.extend(["prot", "protSEC", "genes"])
-    rule_all_ins.extend(expand(join(MEME_dir, "{PeakTool}", "{name}_meme", "meme-chip.html"), PeakTool=PeakTools, name=chips))
-    rule_all_ins.extend(expand(join(MEME_dir, "{PeakTool}", "{name}_ame", "ame.html"), PeakTool=PeakTools, name=chips))
-    if paired_end:
-        rule_all_ins.extend(expand(join(qc_dir, "{name}.{stem}.insert_size_metrics.txt"), name=samples, stem=file_stems))
-    if assay == "chip":
-        rule_all_ins.extend(expand(join(macsB_dir, "{name}", "{name}_peaks.broadPeak"), name=chips))
-        rule_all_ins.extend(expand(join(sicer_dir, "{name}", "{name}_broadpeaks.bed"), name=chips))
+
+    elif assay in ["atac", "chip"]:
+        peak_types.extend(["prot", "protSEC", "genes"])
+        rule_all_ins.extend(expand(join(MEME_dir, "{PeakTool}", "{name}_meme", "meme-chip.html"), PeakTool=PeakTools, name=chips))
+        rule_all_ins.extend(expand(join(MEME_dir, "{PeakTool}", "{name}_ame", "ame.html"), PeakTool=PeakTools, name=chips))
         if paired_end:
-            short_ext, tag_ext = ["sorted", "Q5DD"], ["sorted", "Q5DD_tagAlign"]
-            rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt"), name=samples, ext=short_ext))
-            rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.pdf"), name=samples, ext=short_ext))
-            rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt.txt"), name=samples, ext=short_ext))
-            rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt"), name=samples, ext=tag_ext))
-            rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.pdf"), name=samples, ext=tag_ext))
-            rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt.txt"), name=samples, ext=tag_ext))
-    if assay == "atac":
-        rule_all_ins.extend(expand(
-            join(genrich_dir, "{name}", "{name}.narrowPeak"), name=chips
-        ))
-    if reps:
-        rule_all_ins.extend(expand(
-            join(uropa_dir, "{PeakTool}", "{name}_{PeakTool}_uropa_{type}_allhits.txt"),
-            PeakTool=PeakTools, name=chips, _type=peak_types
-        ))
-        rule_all_ins.extend(expand(
-            join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), 
-            group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC
-        ))
-        # rule_all_ins.extend(expand(
-        #     join(uropa_dir, "{name}_{PeakTool}_uropa_{type}_allhits.txt"), 
-        #     PeakTool=["DiffbindEdgeR", "DiffbindDeseq2"], 
-        #     name=contrasts, 
-        #     _type=peak_types
-        # ))
-        if contrast:
+            rule_all_ins.extend(expand(join(qc_dir, "{name}.{stem}.insert_size_metrics.txt"), name=samples, stem=file_stems))
+        if assay == "chip":
+            rule_all_ins.extend(expand(join(macsB_dir, "{name}", "{name}_peaks.broadPeak"), name=chips))
+            rule_all_ins.extend(expand(join(sicer_dir, "{name}", "{name}_broadpeaks.bed"), name=chips))
+            if paired_end:
+                short_ext, tag_ext = ["sorted", "Q5DD"], ["sorted", "Q5DD_tagAlign"]
+                rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt"), name=samples, ext=short_ext))
+                rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.pdf"), name=samples, ext=short_ext))
+                rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt.txt"), name=samples, ext=short_ext))
+                rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt"), name=samples, ext=tag_ext))
+                rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.pdf"), name=samples, ext=tag_ext))
+                rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt.txt"), name=samples, ext=tag_ext))
+        elif assay == "atac":
             rule_all_ins.extend(expand(
-                join(uropa_dir, "promoterTable2", "DiffbindDeseq2_{PeakTool}_promoter_overlap_summaryTable.txt"), 
-                PeakTool=PeakTools
+                join(genrich_dir, "{name}", "{name}.narrowPeak"), name=chips
             ))
+        if reps:
+            rule_all_ins.extend(expand(
+                join(uropa_dir, "{PeakTool}", "{name}_{PeakTool}_uropa_{type}_allhits.txt"),
+                PeakTool=PeakTools, name=chips, _type=peak_types
+            ))
+            rule_all_ins.extend(expand(
+                join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), 
+                group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC
+            ))
+            rule_all_ins.extend(expand(
+                join(uropa_dir, "{name}_{PeakTool}_uropa_{type}_allhits.txt"), 
+                PeakTool=["DiffbindEdgeR", "DiffbindDeseq2"], 
+                name=contrasts, 
+                _type=peak_types
+            ))
+            if contrast:
+                rule_all_ins.extend(expand(
+                    join(uropa_dir, "promoterTable2", "DiffbindDeseq2_{PeakTool}_promoter_overlap_summaryTable.txt"), 
+                    PeakTool=PeakTools
+                ))
+
 rule_all_ins.append(join(workpath,"multiqc_report.html"))
 rule_all_ins.extend(expand(join(qc_dir, "{name}.preseq.dat"), name=samples))
 rule_all_ins.extend(
diff --git a/workflow/rules/dba.smk b/workflow/rules/dba.smk
index 69aecc4..d767f57 100644
--- a/workflow/rules/dba.smk
+++ b/workflow/rules/dba.smk
@@ -1,18 +1,22 @@
 # Differential binding analysis rules
 # ~~~~
-from os.path import join
 import os
-from scripts.common import allocated, mk_dir_if_not_exist
+import json
+from os.path import join
+from scripts.common import allocated, mk_dir_if_not_exist, test_combine
 from scripts.peakcall import outputIDR, zip_peak_files, calc_effective_genome_fraction
 from scripts.blocking import test_for_block
 
 
 # ~~ workflow configuration
 workpath                        = config['project']['workpath']
+bin_path                        = config['project']['binpath']
 genome                          = config['options']['genome']
 blocks                          = config['project']['blocks']
 groupdata                       = config['project']['groups']
-
+contrast                        = config['project']['contrast']
+uropaver                        = config['tools']['UROPAVER']
+gtf                             = config['references'][genome]['GTFFILE']
 
 # ~~ directories
 bin_path                        = join(workpath, "workflow", "bin")
@@ -31,8 +35,6 @@ otherDirs                       = [qc_dir, homer_dir, uropa_dir]
 cfTool_dir                      = join(workpath, "cfChIPtool")
 cfTool_subdir2                  = join(cfTool_dir, "BED", "H3K4me3")
 
-
-
 # ~~ workflow switches
 blocking                        = False if None in list(blocks.values()) else True
 if reps == "yes": otherDirs.append(diffbind_dir)
@@ -86,136 +88,154 @@ contrastBlock = test_for_block(groupdata, contrast, blocks)
 zipGroup1B, zipGroup2B, zipToolCB, contrastsB = zip_contrasts(contrastBlock, PeakTools)
 
 # ~~ rules 
-
 rule diffbind:
     input:
         lambda w: [ join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) for chip in chips ]
     output:
-        html = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind.html"),
-        Deseq2 = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2.bed"),
-        EdgeR = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR.bed"),
-        EdgeR_txt = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR.txt"),
-        Deseq2_txt = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2.txt"),
-        EdgeR_ftxt = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR_fullList.txt"),
-        Deseq2_ftxt = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2_fullList.txt"),
-        html_block = provided(join(diffbind_dir_block, "{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_blocking.html"), blocking)
+        html                            = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"),
+        Deseq2                          = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2.bed"),
+        EdgeR                           = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR.bed"),
+        EdgeR_txt                       = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR.txt"),
+        Deseq2_txt                      = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2.txt"),
+        EdgeR_ftxt                      = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR_fullList.txt"),
+        Deseq2_ftxt                     = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2_fullList.txt"),
+        html_block                      = provided(join(diffbind_dir_block, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_blocking.html"), blocking)
     params:
-        rname = "diffbind",
-        rscript = join(workpath, "workflow", "scripts","DiffBind_v2_ChIPseq.Rmd"),
-        outdir    = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}"),
-        contrast  = "{group1}_vs_{group2}",
-        csvfile   = join(workpath,diffbind_dir,"{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_prep.csv"),
-        pythonscript = join(workpath,"workflow","scripts","prep_diffbind.py"),
-        PeakExtension= lambda w: PeakExtensions[w.PeakTool],
-        peakcaller= lambda w: FileTypesDiffBind[w.PeakTool],
-        group1="{group1}",
-        group2="{group2}",
-        PeakTool="{PeakTool}",
-        blocking=blocking,
-        blocking_rscript = join(workpath,"workflow","scripts","DiffBind_v2_ChIPseq_block.Rmd"),
-        outdir_block= join(workpath,diffbind_dir_block,"{group1}_vs_{group2}-{PeakTool}"),
-        Deseq2_block = provided(join(workpath, diffbind_dir_block,"{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2_block.bed"), blocking),
-        EdgeR_block = provided(join(workpath, diffbind_dir_block,"{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR_block.bed"), blocking),
+        # variables and wildcards used in the shell directive
+        rname                           = "diffbind",
+        group1                          = "{group1}",
+        group2                          = "{group2}",
+        this_peaktool                   = "{PeakTool}",
+        this_contrast                   = "{group1}_vs_{group2}",
+        this_peakextension              = lambda w: PeakExtensions[w.PeakTool],
+        peakcaller                      = lambda w: FileTypesDiffBind[w.PeakTool],
+        # scripts in the bin directory used in the shell directive
+        rscript                         = join(bin_path, "DiffBind_v2_ChIPseq.Rmd"),
+        pythonscript                    = join(bin_path, "prep_diffbind.py"),
+        blocking_rscript                = join(bin_path, "DiffBind_v2_ChIPseq_block.Rmd"),
+        # output base directories or full file locations
+        outdir                          = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}"),
+        csvfile                         = join(
+                                            diffbind_dir, 
+                                            "{group1}_vs_{group2}-{PeakTool}",
+                                            "{group1}_vs_{group2}-{PeakTool}_Diffbind_prep.csv"
+                                        ),
+        Deseq2_block                    = provided(join(
+                                            diffbind_dir_block, 
+                                            "{group1}_vs_{group2}-{PeakTool}", 
+                                            "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2_block.bed"
+                                        ), blocking),
+        EdgeR_block                     = provided(join(
+                                            diffbind_dir_block, 
+                                            "{group1}_vs_{group2}-{PeakTool}", 
+                                            "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR_block.bed"
+                                        ), blocking),
+        outdir_block                    = join(diffbind_dir_block, "{group1}_vs_{group2}-{PeakTool}"),
     container:
         config['images']['cfchip']
-    shell: """
-    python {params.pythonscript} --g1 {params.group1} --g2 {params.group2} --wp {workpath} \
-         --pt {params.PeakTool} --pe {params.PeakExtension} --bd {bam_dir} \
-         --pc {params.peakcaller} --csv {params.csvfile}
-    cp {params.rscript} {params.outdir}
-    cd {params.outdir}
-    Rscript -e 'rmarkdown::render("DiffBind_v2_ChIPseq.Rmd", output_file= "{output.html}", 
-    params=list(csvfile= "{params.csvfile}", contrasts= "{params.contrast}", peakcaller= "{params.PeakTool}"))'
-    if [ ! -f {output.Deseq2} ]; then touch {output.Deseq2}; fi
-    if [ ! -f {output.EdgeR} ]; then touch {output.EdgeR}; fi
-
-    if [ '{params.blocking}' == True ]; then
-        echo "DiffBind with Blocking"
-        Rscript -e 'rmarkdown::render("{params.blocking_rscript}", output_file= "{output.html_block}", 
-        params=list(csvfile= "{params.csvfile}", contrasts= "{params.contrast}", peakcaller= "{params.PeakTool}", dir= "{params.outdir_block}"))'
-        if [ ! -f {params.Deseq2_block} ]; then touch {params.Deseq2_block}; fi
-        if [ ! -f {params.EdgeR_block} ]; then touch {params.EdgeR_block}; fi
-    fi
-    """
-
-
-if assay == "cfchip":
-    rule UROPA:
-        input:
-            lambda w: [ join(workpath, w.PeakTool1, w.name, w.name + PeakExtensions[w.PeakTool2]) ]
-        output:
-            txt=join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.txt'),
-            bed1=temp(join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.bed')),
-            bed2=temp(join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_finalhits.bed')),
-        params:
-            rname="uropa",
-            uropaver = config['tools']['UROPAVER'],
-            fldr = join(uropa_dir, '{PeakTool1}'),
-            json = join(uropa_dir, '{PeakTool1}','{name}.{PeakTool2}.{type}.json'),
-            outroot = join(uropa_dir, '{PeakTool1}','{name}_{PeakTool2}_uropa_{type}'),
-            gtf = config['references'][genome]['GTFFILE'],
-            threads = 4,
-        shell: """
-        module load {params.uropaver};
-        # Dynamically creates UROPA config file
-        if [ ! -e {params.fldr} ]; then mkdir {params.fldr}; fi
-        echo '{{"queries":[ ' > {params.json}
-        if [ '{wildcards.type}' == 'protTSS' ]; then
-             echo '      {{ "feature":"gene","distance":3000,"filter.attribute":"gene_type","attribute.value":"protein_coding","feature.anchor":"start" }},' >> {params.json}
-             echo '      {{ "feature":"gene","distance":10000,"filter.attribute":"gene_type","attribute.value":"protein_coding","feature.anchor":"start" }},' >> {params.json}
-             echo '      {{ "feature":"gene","distance":100000,"filter.attribute":"gene_type","attribute.value":"protein_coding","feature.anchor":"start" }}],' >> {params.json}
-        fi
-        echo '"show_attributes":["gene_id", "gene_name","gene_type"],' >> {params.json}
-        echo '"priority":"Yes",' >> {params.json}
-        echo '"gtf":"{params.gtf}",' >> {params.json}
-        echo '"bed": "{input}" }}' >> {params.json}
-        uropa -i {params.json} -p {params.outroot} -t {params.threads} -s
+    shell: 
         """
-else:
-    rule UROPA:
-        input:
-            lambda w: [ join(workpath, w.PeakTool1, w.name, w.name + PeakExtensions[w.PeakTool2]) ]
-        output:
-            txt=join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.txt'),
-            bed1=temp(join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.bed')),
-            bed2=temp(join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_finalhits.bed')),
-        params:
-            rname="uropa",
-            uropaver = config['tools']['UROPAVER'],
-            fldr = join(uropa_dir, '{PeakTool1}'),
-            json = join(uropa_dir, '{PeakTool1}','{name}.{PeakTool2}.{type}.json'),
-            outroot = join(uropa_dir, '{PeakTool1}','{name}_{PeakTool2}_uropa_{type}'),
-            gtf = config['references'][genome]['GTFFILE'],
-            threads = 4,
-        shell: """
-        module load {params.uropaver};
-        # Dynamically creates UROPA config file
-        if [ ! -e {params.fldr} ]; then mkdir {params.fldr}; fi
-        echo '{{"queries":[ ' > {params.json}
-        if [ '{wildcards.type}' == 'prot' ]; then
-             echo '      {{ "feature":"gene","distance":5000,"filter.attribute":"gene_type","attribute.value":"protein_coding" }},' >> {params.json}
-             echo '      {{ "feature":"gene","distance":100000,"filter.attribute":"gene_type","attribute.value":"protein_coding" }}],' >> {params.json}
-        elif [ '{wildcards.type}' == 'genes' ]; then
-             echo '      {{ "feature":"gene","distance":5000 }},' >> {params.json}
-             echo '      {{ "feature":"gene","distance":100000 }}],' >> {params.json}
-        elif [ '{wildcards.type}' == 'protSEC' ]; then
-             echo '      {{ "feature":"gene","distance":[3000,1000],"filter.attribute":"gene_type","attribute.value":"protein_coding","feature.anchor":"start" }},' >> {params.json}
-             echo '      {{ "feature":"gene","distance":3000,"filter.attribute":"gene_type","attribute.value":"protein_coding","feature.anchor":"end" }},' >> {params.json}
-             echo '      {{ "feature":"gene","distance":100000,"filter.attribute":"gene_type","attribute.value":"protein_coding","feature.anchor":"center" }},' >> {params.json}
-             echo '      {{ "feature":"gene","distance":100000,"filter.attribute":"gene_type","attribute.value":"protein_coding" }}],' >> {params.json}
-        elif [ '{wildcards.type}' == 'protTSS' ]; then
-             echo '      {{ "feature":"gene","distance":[3000,1000],"filter.attribute":"gene_type","attribute.value":"protein_coding","feature.anchor":"start" }},' >> {params.json}
-             echo '      {{ "feature":"gene","distance":10000,"filter.attribute":"gene_type","attribute.value":"protein_coding","feature.anchor":"start" }},' >> {params.json}
-             echo '      {{ "feature":"gene","distance":100000,"filter.attribute":"gene_type","attribute.value":"protein_coding","feature.anchor":"start" }}],' >> {params.json}
+        python {params.pythonscript} --g1 {params.group1} --g2 {params.group2} --wp {workpath} \
+            --pt {params.this_peaktool} --pe {params.this_peakextension} --bd {bam_dir} \
+            --pc {params.peakcaller} --csv {params.csvfile}
+        cp {params.rscript} {params.outdir}
+        cd {params.outdir}
+        Rscript -e 'rmarkdown::render("DiffBind_v2_ChIPseq.Rmd", output_file= "{output.html}", 
+        params=list(csvfile= "{params.csvfile}", contrasts= "{params.this_contrast}", peakcaller= "{params.this_peaktool}"))'
+        # The Rmd silences its .bed export with try(); make sure the declared outputs exist either way.
+        for f in {output.Deseq2} {output.EdgeR}; do [ -f "$f" ] || touch "$f"; done
 
+        # blocking is the workflow-level switch, not a rule param, hence no params prefix.
+        if [ '{blocking}' == True ]; then
+            echo "DiffBind with Blocking"
+            Rscript -e 'rmarkdown::render("{params.blocking_rscript}", output_file= "{output.html_block}", 
+            params=list(csvfile= "{params.csvfile}", contrasts= "{params.this_contrast}", peakcaller= "{params.this_peaktool}", dir= "{params.outdir_block}"))'
+            for f in {params.Deseq2_block} {params.EdgeR_block}; do [ -f "$f" ] || touch "$f"; done
         fi
-        echo '"show_attributes":["gene_id", "gene_name","gene_type"],' >> {params.json}
-        echo '"priority":"Yes",' >> {params.json}
-        echo '"gtf":"{params.gtf}",' >> {params.json}
-        echo '"bed": "{input}" }}' >> {params.json}
-        uropa -i {params.json} -p {params.outroot} -t {params.threads} -s
         """
 
+
+rule UROPA:
+    # Annotates one peak file with UROPA; the query set depends on the assay and the `type` wildcard.
+    input:
+        lambda w: [ join(workpath, w.PeakTool1, w.name, w.name + PeakExtensions[w.PeakTool2]) ],
+    output:
+        txt                             = join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.txt'),
+        bed1                            = temp(join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.bed')),
+        bed2                            = temp(join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_finalhits.bed')),
+    params:
+        rname                           = "uropa",
+        fldr                            = join(uropa_dir, '{PeakTool1}'),
+        json                            = join(uropa_dir, '{PeakTool1}', '{name}.{PeakTool2}.{type}.json'),
+        outroot                         = join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}'),
+    threads: 4,
+    run:
+        # A run block is plain Python: "{...}" placeholders are only expanded inside
+        # shell(), so params / wildcards / input are read as objects here.
+        os.makedirs(params.fldr, mode=0o775, exist_ok=True)
+
+        json_construct = {
+            'queries': [],
+            'show_attributes': ["gene_id", "gene_name", "gene_type"],
+            'priority': "Yes",
+            'gtf': gtf,
+            'bed': str(input[0]),
+        }
+
+        base_query = {
+            "feature": "gene",
+            "filter.attribute": "gene_type",
+            "attribute.value": "protein_coding",
+            "feature.anchor": "start"
+        }
+
+        if assay == 'cfchip':
+            if wildcards.type == 'protTSS':
+                for _d in (3000, 10000, 100000):
+                    this_q = base_query.copy()
+                    this_q['distance'] = _d
+                    json_construct['queries'].append(this_q)
+        else:
+            if wildcards.type == 'prot':
+                for _d in (5000, 100000):
+                    this_q = base_query.copy()
+                    del this_q["feature.anchor"]
+                    this_q['distance'] = _d
+                    json_construct['queries'].append(this_q)
+            elif wildcards.type == 'genes':
+                for _d in (5000, 100000):
+                    this_q = base_query.copy()
+                    del this_q["feature.anchor"]
+                    del this_q["filter.attribute"]
+                    del this_q["attribute.value"]
+                    this_q['distance'] = _d
+                    json_construct['queries'].append(this_q)
+            elif wildcards.type == 'protSEC':
+                #   distance, feature.anchor (None = leave the anchor unset)
+                query_values = (
+                    ([3000, 1000], "start"),
+                    (3000,         "end"),
+                    (100000,       "center"),
+                    (100000,       None)
+                )
+                for _distance, feature_anchor in query_values:
+                    this_q = base_query.copy()
+                    del this_q["feature.anchor"]
+                    if feature_anchor:
+                        this_q["feature.anchor"] = feature_anchor
+                    this_q['distance'] = _distance
+                    json_construct['queries'].append(this_q)
+            elif wildcards.type == 'protTSS':
+                for _d in ([3000, 1000], 10000, 100000):
+                    this_q = base_query.copy()
+                    this_q['distance'] = _d
+                    json_construct['queries'].append(this_q)
+        with open(params.json, 'w') as jo:
+            json.dump(json_construct, jo, indent=4)
+        # module load must share a shell with uropa: every shell() call starts a fresh process.
+        shell("module load {uropaver}; uropa -i {params.json} -p {params.outroot} -t {threads} -s")
+
+
 rule manorm:
     input: 
         bam1 = lambda w: join(workpath,bam_dir, groupdata[w.group1][0] + ".Q5DD.bam"),
diff --git a/workflow/scripts/common.py b/workflow/scripts/common.py
index 6feba1c..26f44b6 100644
--- a/workflow/scripts/common.py
+++ b/workflow/scripts/common.py
@@ -258,4 +258,14 @@ def get_fqscreen_outputs(paired_end, samples, qc_dir):
         outs.extend(expand(join(qc_dir, "FQscreen", "{name}.R1.trim_screen.png"), name=samples)),
         outs.extend(expand(join(qc_dir, "FQscreen2", "{name}.R1.trim_screen.txt"), name=samples)),
         outs.extend(expand(join(qc_dir, "FQscreen2", "{name}.R1.trim_screen.png"), name=samples)),
-    return outs
\ No newline at end of file
+    return outs
+
+
+def test_combine(one, two):
+    """Return ``one + two``; if the operands cannot be added, print both and abort."""
+    try:
+        return one + two
+    except TypeError:
+        import sys  # local import: does not rely on the module's existing imports
+        print(one, two, sep="\n", file=sys.stderr)
+        sys.exit(1)  # non-zero status; a bare exit() would report success
\ No newline at end of file

From 21c4aea38e20ab96f11f5b741ad19fe4d002fc5d Mon Sep 17 00:00:00 2001
From: Ryan Routsong <routsongrm@nih.gov>
Date: Mon, 8 Jul 2024 18:57:18 -0400
Subject: [PATCH 04/28] chore: add back bin scripts, correct pathing for bin
 scripts, formatting

---
 bin/DiffBind_v2_ChIPseq.Rmd       | 260 +++++++++++++++++++++++++++++
 bin/DiffBind_v2_ChIPseq_block.Rmd | 267 ++++++++++++++++++++++++++++++
 bin/DiffBind_v2_cfChIP_QC.Rmd     | 204 +++++++++++++++++++++++
 bin/FRiP_plot.R                   | 112 +++++++++++++
 bin/atac_nrf.py                   |  22 +++
 bin/bam_filter_by_mapq.py         |  40 +++++
 bin/cfChIP_signatures.R           |  97 +++++++++++
 bin/frip.py                       | 164 ++++++++++++++++++
 bin/jaccard_score.py              | 202 ++++++++++++++++++++++
 bin/ppqt_process.py               |  27 +++
 bin/prep_diffbind.py              |  54 ++++++
 bin/prep_diffbindQC.py            |  51 ++++++
 bin/promoterAnnotation_by_Gene.R  | 179 ++++++++++++++++++++
 bin/significantPathways.R         | 127 ++++++++++++++
 workflow/Snakefile                |   1 +
 workflow/rules/cfChIP.smk         | 153 +++++++++--------
 workflow/rules/qc.smk             |  64 +++----
 workflow/scripts/common.py        |  12 +-
 18 files changed, 1925 insertions(+), 111 deletions(-)
 create mode 100644 bin/DiffBind_v2_ChIPseq.Rmd
 create mode 100644 bin/DiffBind_v2_ChIPseq_block.Rmd
 create mode 100644 bin/DiffBind_v2_cfChIP_QC.Rmd
 create mode 100644 bin/FRiP_plot.R
 create mode 100644 bin/atac_nrf.py
 create mode 100644 bin/bam_filter_by_mapq.py
 create mode 100755 bin/cfChIP_signatures.R
 create mode 100644 bin/frip.py
 create mode 100644 bin/jaccard_score.py
 create mode 100644 bin/ppqt_process.py
 create mode 100644 bin/prep_diffbind.py
 create mode 100644 bin/prep_diffbindQC.py
 create mode 100755 bin/promoterAnnotation_by_Gene.R
 create mode 100755 bin/significantPathways.R

diff --git a/bin/DiffBind_v2_ChIPseq.Rmd b/bin/DiffBind_v2_ChIPseq.Rmd
new file mode 100644
index 0000000..031799d
--- /dev/null
+++ b/bin/DiffBind_v2_ChIPseq.Rmd
@@ -0,0 +1,260 @@
+---
+title: "DiffBind: ChIP-seq pipeline"
+output: 
+    html_document:
+        toc: true
+        toc_float:
+           collapsed: false
+        number_sections: true
+        toc_depth: 3
+        fig_width: 7
+        fig_height: 6
+params:
+    csvfile: samplesheet.csv
+    contrasts: "group1_vs_group2"
+    peakcaller: "macs"
+---
+
+<style type="text/css">
+  body{
+  font-size: 12pt;
+}
+</style>
+
+```{r, include=FALSE, warning=FALSE, message=FALSE}
+## grab args
+dateandtime<-format(Sys.time(), "%a %b %d %Y - %X")
+
+csvfile <- params$csvfile
+contrasts <- params$contrasts
+peakcaller <- params$peakcaller
+```
+
+**Groups being compared:**
+    *`r contrasts`*  
+**Peak sources:**
+    *`r peakcaller`*  
+**Report generated:** 
+    *`r dateandtime`*  
+
+```{r setup, echo=FALSE, warning=FALSE,message=FALSE}
+knitr::opts_chunk$set(echo = FALSE, include=TRUE, message=FALSE, warning=FALSE, error=FALSE)
+suppressMessages(library(DT))
+suppressMessages(library(DiffBind))
+suppressMessages(library(parallel))
+```
+
+# Peak Data
+Read in sample sheet information and peak information
+```{r samples} 
+samples <- dba(sampleSheet=csvfile)
+consensus <- dba.peakset(samples,consensus=DBA_CONDITION)
+print(samples)
+```
+
+## Correlation heatmap: Only peaks
+Pearson correlation of peak positions: all samples versus all samples  
+```{r heatmap1}
+try(dba.plotHeatmap(samples,main="",cexRow=1,cexCol=1),silent=TRUE)
+```
+
+## PCA: Only peaks
+Variance of peak positions  
+```{r PCA1, fig.height=5,fig.width=5}
+try(dba.plotPCA(samples,DBA_CONDITION),silent=TRUE)
+```
+
+## Overlapping peak counts
+Number of overlapping peaks.  
+If the number of samples is greater than 4, a "consensus" peak Venn diagram is created, where
+the consensus peak set for each condition consists of the peaks identified in at least 2 samples of that condition. This is different
+from the consensus peak set used for differential analyses.
+```{r Venn, fig.height=4}
+if (nrow(samples$samples) < 5) {  # up to 4 samples: one Venn over every sample
+   dba.plotVenn(samples,1:nrow(samples$samples))
+} else {  # 5+ samples: Venn of the per-condition consensus sets, then one Venn per group
+   dba.plotVenn(consensus,consensus$masks$Consensus,main="Binding Site Overlaps: 'consensus', comparing between groups")
+   try(dba.plotVenn(samples,samples$masks[[3]],main="Binding Site Overlaps: samples in Group1"),silent=TRUE)  # assumes mask 3 is the first condition -- TODO confirm
+   try(dba.plotVenn(samples,samples$masks[[4]],main="Binding Site Overlaps: samples in Group2"),silent=TRUE)  # assumes mask 4 is the second condition -- TODO confirm
+}
+```
+
+# Consensus peaks and counts
+Consensus peaks are peaks found in at least two samples, independent of condition.
+FRiP is of consensus peaks and will not match FRiP values calculated outside of this tool.
+```{r peaksORsummits}
+if ( grepl("narrow",samples$samples$Peaks[1]) ) {
+   summits <- TRUE
+   print ("Narrow peak calling tool.")
+   print ("Differential peaks are 250bp upstream and downstream of the summits.")
+} else if ( grepl("broad",samples$samples$Peaks[1]) ) {
+  summits <- FALSE
+  print ("Broad peak calling tool.")
+  print ("Differential peaks are consensus peaks.")
+} else {
+  summits <- FALSE
+  print ("Indeterminate peak calling tool.")
+  print ("Differential peaks are consensus peaks.")
+}
+```
+
+```{r DBcount}
+if (summits == TRUE) {
+	DBdataCounts <- dba.count(samples, summits=250)
+} else {
+	DBdataCounts <- dba.count(samples)
+}
+print(DBdataCounts)
+outfile2 <- paste0(contrasts, "-", peakcaller,"_Diffbind_consensusPeaks.bed")
+consensus2 <- dba.peakset(DBdataCounts,bRetrieve=T)
+consensus2$name <- paste0("Peak",1:length(consensus2))
+#rtracklayer::export(consensus2,outfile2)
+```
+
+## Correlation heatmap: Peaks and reads
+Pearson correlation of library-size normalized counts of consensus peaks: all samples versus all samples
+```{r heatmap2}
+try(dba.plotHeatmap(DBdataCounts,main="",cexRow=1,cexCol=1),silent=TRUE)
+```
+
+## Heatmap: Average signal across each peak
+1000 most variable consensus peaks (library-size normalized counts)
+```{r heatmap3}
+try(dba.plotHeatmap(DBdataCounts,correlations=FALSE,cexRow=1,cexCol=1),silent=TRUE)
+```
+
+## PCA: Peaks and reads
+Variation of library-size normalized counts of consensus peaks
+```{r PCA2, fig.height=5,fig.width=5}
+try(dba.plotPCA(DBdataCounts,DBA_CONDITION),silent=TRUE)
+```
+
+# Set Up Contrast
+Contrast is Group1 - Group2.
+```{r contrast}
+DBdatacontrast <- dba.contrast(DBdataCounts, minMembers=2, categories = DBA_CONDITION)
+print(DBdatacontrast)
+```
+
+# Differential Analysis
+This report shows the differential analysis with two tools: Deseq2 and EdgeR. For most
+projects, Deseq2 is the optimal tool. Both tools assume that the majority of peaks are
+not changing between the two conditions. EdgeR also assumes that there are equal numbers
+of peaks on each side of the contrast, so it normalizes the data more than Deseq2. EdgeR
+is especially useful when this assumption is true or when there are large differences in
+library size across samples. All concentrations are on log2 scale.
+
+```{r analyze}
+DBAnalysisDeseq2 <- dba.analyze(DBdatacontrast, method = DBA_DESEQ2)
+DBAnalysisEdgeR <- dba.analyze(DBdatacontrast, method = DBA_EDGER)
+```
+
+```{r report}
+DBReportDeseq2 <- dba.report(DBAnalysisDeseq2, method = DBA_DESEQ2)
+DBReportEdgeR <- dba.report(DBAnalysisEdgeR, method = DBA_EDGER)
+```
+
+## PCA {.tabset .tabset-fade}
+Variance of differential peaks only
+
+### DeSeq2 {-}
+```{r PCA3, fig.height=5,fig.width=5}
+try(dba.plotPCA(DBAnalysisDeseq2, contrast=1, method= DBA_DESEQ2),silent=TRUE)
+```
+
+### EdgeR {-}
+```{r PCA4, fig.height=5,fig.width=5}
+try(dba.plotPCA(DBAnalysisEdgeR, contrast=1, method = DBA_EDGER),silent=TRUE)
+```
+
+## MA plot {.tabset .tabset-fade}
+"Log concentration" means average concentration across all samples.
+Each dot is a consensus peak.
+
+### DeSeq2 {-}
+```{r MA_D}
+try(dba.plotMA(DBAnalysisDeseq2, method = DBA_DESEQ2),silent=TRUE)
+```
+
+### EdgeR {-}
+```{r MA_E}
+try(dba.plotMA(DBAnalysisEdgeR, method = DBA_EDGER),silent=TRUE)
+```
+
+## Volcano plot {.tabset .tabset-fade}
+Each dot is a consensus peak.
+
+### DeSeq2 {-}
+```{r Volcano1}
+try(dba.plotVolcano(DBAnalysisDeseq2, method = DBA_DESEQ2),silent=TRUE)
+```
+
+### EdgeR {-}
+```{r Volcano2}
+try(dba.plotVolcano(DBAnalysisEdgeR, method = DBA_EDGER),silent=TRUE)
+```
+
+## Heatmap: Differential {.tabset .tabset-fade}
+1000 most significant differential peaks (Deseq2 or EdgeR normalized)
+
+### DeSeq2 {-}
+```{r heatmap4D}
+try(dba.plotHeatmap(DBAnalysisDeseq2,contrast=1,method = DBA_DESEQ2,correlations=FALSE,margin=20,cexRow=1,cexCol=1),silent=TRUE)
+```
+
+### EdgeR {-}
+```{r heatmap4E}
+try(dba.plotHeatmap(DBAnalysisEdgeR,contrast=1,method = DBA_EDGER,correlations=FALSE,margin=20,cexRow=1,cexCol=1),silent=TRUE)
+```
+
+## Top 500 differentially bound peaks {.tabset .tabset-fade}
+### DeSeq2 {-}
+```{r Deseq2Report}
+# Relative output names; they resolve against the working directory at knit time.
+outfile <- paste0(contrasts, "-", peakcaller, "_Diffbind_Deseq2.txt")
+outfile2 <- paste0(contrasts, "-", peakcaller, "_Diffbind_Deseq2.bed")
+# seq_len() rather than 1:length(): 1:0 counts down (1, 0), so an empty report
+# would be given two bogus names instead of none.
+DBReportDeseq2$name <- paste0("Peak", seq_len(length(DBReportDeseq2)))
+# Best-effort BED export: try() swallows any failure, so the .bed may be absent.
+try(rtracklayer::export(DBReportDeseq2, outfile2),silent=TRUE)
+write.table(DBReportDeseq2, outfile, quote=F, sep="\t", row.names=F)
+# Show at most the first 500 rows. head() yields an empty table for an empty
+# report, whereas explicit 1:i row indexing would render one all-NA row.
+try(DT::datatable(head(data.frame(DBReportDeseq2), 500), rownames=F),silent=TRUE)
+
+# th=100 presumably disables the significance cut-off so every site is listed
+# (hence "fullList") -- confirm against the dba.report documentation.
+report2 <- dba.report(DBAnalysisDeseq2,method = DBA_DESEQ2,th=100,bNormalized=T,bFlip=FALSE,precision=0,bCalled=T)
+outfile3 <- paste0(contrasts, "-", peakcaller, "_Diffbind_Deseq2_fullList.txt")
+write.table(report2, outfile3, quote=F, sep="\t", row.names=F)
+```
+
+### EdgeR {-}
+```{r EdgeRReport}
+# Relative output names; they resolve against the working directory at knit time.
+outfile <- paste0(contrasts, "-", peakcaller,"_Diffbind_EdgeR.txt")
+outfile2 <- paste0(contrasts, "-", peakcaller,"_Diffbind_EdgeR.bed")
+# seq_len() rather than 1:length(): 1:0 counts down (1, 0), so an empty report
+# would be given two bogus names instead of none.
+DBReportEdgeR$name <- paste0("Peak", seq_len(length(DBReportEdgeR)))
+# Best-effort BED export: try() swallows any failure, so the .bed may be absent.
+try(rtracklayer::export(DBReportEdgeR, outfile2),silent=TRUE)
+write.table(DBReportEdgeR, outfile, quote=F, sep="\t", row.names=F)
+# Show at most the first 500 rows. head() yields an empty table for an empty
+# report, whereas explicit 1:i row indexing would render one all-NA row.
+try(DT::datatable(head(data.frame(DBReportEdgeR), 500), rownames=F),silent=TRUE)
+
+# th=100 presumably disables the significance cut-off so every site is listed
+# (hence "fullList") -- confirm against the dba.report documentation.
+report2 <- dba.report(DBAnalysisEdgeR,method = DBA_EDGER,th=100,bNormalized=T,bFlip=FALSE,precision=0,bCalled=T)
+outfile3 <- paste0(contrasts, "-", peakcaller, "_Diffbind_EdgeR_fullList.txt")
+write.table(report2, outfile3, quote=F, sep="\t", row.names=F)
+```
+
+## R tool version information
+```{r Info}
+sessionInfo()
+```
+
+<div class="tocify-extend-page" data-unique="tocify-extend-page" style="height: 0;"></div>
diff --git a/bin/DiffBind_v2_ChIPseq_block.Rmd b/bin/DiffBind_v2_ChIPseq_block.Rmd
new file mode 100644
index 0000000..2a508b5
--- /dev/null
+++ b/bin/DiffBind_v2_ChIPseq_block.Rmd
@@ -0,0 +1,267 @@
+---
+title: "DiffBind: ChIP-seq pipeline, paired/blocked analysis"
+output: 
+    html_document:
+        toc: true
+        toc_float:
+           collapsed: false
+        number_sections: true
+        toc_depth: 3
+        fig_width: 7
+        fig_height: 6
+params:
+    csvfile: samplesheet.csv
+    contrasts: "group1_vs_group2"
+    peakcaller: "macs"
+    dir: "/path/to/DiffBindBlock/directory"
+---
+
+<style type="text/css">
+  body{
+  font-size: 12pt;
+}
+</style>
+
+```{r, include=FALSE, warning=FALSE, message=FALSE}
+## grab args
+dateandtime<-format(Sys.time(), "%a %b %d %Y - %X")
+
+csvfile <- params$csvfile
+contrasts <- params$contrasts
+peakcaller <- params$peakcaller
+```
+
+**Groups being compared:**
+    *`r contrasts`*  
+**Peak sources:**
+    *`r peakcaller`*  
+**Report generated:** 
+    *`r dateandtime`*  
+
+```{r setup, echo=FALSE, warning=FALSE,message=FALSE}
+knitr::opts_chunk$set(echo = FALSE, include=TRUE, message=FALSE, warning=FALSE, error=FALSE)
+knitr::opts_knit$set(root.dir=params$dir)
+suppressMessages(library(DT))
+suppressMessages(library(DiffBind))
+suppressMessages(library(parallel))
+```
+
+# Peak Data
+Read in sample sheet information and peak information
+```{r samples} 
+samples <- dba(sampleSheet=csvfile)
+consensus <- dba.peakset(samples,consensus=DBA_CONDITION)
+print(samples)
+```
+
+## Correlation heatmap: Only peaks
+Pearson correlation of peak positions: all samples versus all samples  
+```{r heatmap1}
+try(dba.plotHeatmap(samples,main="",cexRow=1,cexCol=1),silent=TRUE)
+```
+
+## PCA: Only peaks
+Variance of peak positions  
+```{r PCA1, fig.height=5,fig.width=5}
+try(dba.plotPCA(samples,DBA_CONDITION),silent=TRUE)
+```
+
+## Overlapping peak counts
+Number of overlapping peaks.  
+If the number of samples is greater than 4, a "consensus" peak Venn diagram is created, where
+the consensus peak set are the peaks identified in at least 2 samples for that condition. This is different
+from the consensus peak set used for differential analyses.
+```{r Venn, fig.height=4}
+if (nrow(samples$samples) < 5) {  # 4 or fewer samples: overlap every sample directly
+   dba.plotVenn(samples,1:nrow(samples$samples))
+} else {
+   dba.plotVenn(consensus,consensus$masks$Consensus,main="Binding Site Overlaps: 'consensus', comparing between groups")
+   try(dba.plotVenn(samples,samples$masks[[3]],main="Binding Site Overlaps: samples in Group1"),silent=TRUE)  # NOTE(review): assumes mask 3 is Group1 - confirm
+   try(dba.plotVenn(samples,samples$masks[[4]],main="Binding Site Overlaps: samples in Group2"),silent=TRUE)  # NOTE(review): assumes mask 4 is Group2 - confirm
+}
+```
+
+# Consensus peaks and counts
+Consensus peaks are peaks found in at least two samples, independent of condition.
+FRiP is of consensus peaks and will not match FRiP values calculated outside of this tool.
+```{r peaksORsummits}
+if ( grepl("narrow",samples$samples$Peaks[1]) ) {
+   summits <- TRUE
+   print ("Narrow peak calling tool.")
+   print ("Differential peaks are 250bp upstream and downstream of the summits.")
+} else if ( grepl("broad",samples$samples$Peaks[1]) ) {
+  summits <- FALSE
+  print ("Broad peak calling tool.")
+  print ("Differential peaks are consensus peaks.")
+} else {
+  summits <- FALSE
+  print ("Indeterminate peak calling tool.")
+  print ("Differential peaks are consensus peaks.")
+}
+```
+
+```{r DBcount}
+if (summits == TRUE) {
+	DBdataCounts <- dba.count(samples, summits=250)
+} else {
+	DBdataCounts <- dba.count(samples)
+}
+print(DBdataCounts)
+outfile2 <- paste0(contrasts, "-", peakcaller,"_Diffbind_consensusPeaks.bed")
+consensus2 <- dba.peakset(DBdataCounts,bRetrieve=T)
+consensus2$name <- paste0("Peak",1:length(consensus2))
+#rtracklayer::export(consensus2,outfile2)
+```
+
+## Correlation heatmap: Peaks and reads
+Pearson correlation of library-size normalized counts of consensus peaks: all samples versus all samples
+```{r heatmap2}
+try(dba.plotHeatmap(DBdataCounts,main="",cexRow=1,cexCol=1),silent=TRUE)
+```
+
+## Heatmap: Average signal across each peak
+1000 most variable consensus peaks (library-size normalized counts)
+```{r heatmap3}
+try(dba.plotHeatmap(DBdataCounts,correlations=FALSE,cexRow=1,cexCol=1),silent=TRUE)
+```
+
+## PCA: Peaks and reads
+Variation of library-size normalized counts of consensus peaks
+```{r PCA2, fig.height=5,fig.width=5}
+try(dba.plotPCA(DBdataCounts,DBA_CONDITION),silent=TRUE)
+```
+
+# Set Up Contrast
+Contrast is Group1 - Group2.
+```{r contrast}
+DBdatacontrast <- dba.contrast(DBdataCounts, minMembers=2, categories = DBA_CONDITION,
+							   block=DBA_TREATMENT)
+print(DBdatacontrast)
+```
+
+# Differential Analysis
+This report shows the differential analysis with two tools: Deseq2 and EdgeR. For most
+projects, Deseq2 is the optimal tool. Both tools assume that the majority of peaks are
+not changing between the two conditions. EdgeR also assumes that there are equal numbers
+of peaks on each side of the contrast, so it normalizes the data more than Deseq2. EdgeR
+is especially useful when this assumption is true or when there are large differences in
+library size across samples. All concentrations are on log2 scale.
+
+```{r analyze}
+DBAnalysisDeseq2 <- dba.analyze(DBdatacontrast, method = DBA_DESEQ2)
+DBAnalysisEdgeR <- dba.analyze(DBdatacontrast, method = DBA_EDGER)
+```
+
+```{r report}
+DBReportDeseq2 <- dba.report(DBAnalysisDeseq2, method = DBA_DESEQ2_BLOCK)
+DBReportEdgeR <- dba.report(DBAnalysisEdgeR, method = DBA_EDGER_BLOCK)
+```
+
+## PCA {.tabset .tabset-fade}
+Variance of differential peaks only
+
+### DeSeq2 {-}
+```{r PCA3, fig.height=5,fig.width=5}
+try(dba.plotPCA(DBAnalysisDeseq2, contrast=1, method= DBA_DESEQ2_BLOCK),silent=TRUE)
+```
+
+### EdgeR {-}
+```{r PCA4, fig.height=5,fig.width=5}
+try(dba.plotPCA(DBAnalysisEdgeR, contrast=1, method = DBA_EDGER_BLOCK),silent=TRUE)
+```
+
+## MA plot {.tabset .tabset-fade}
+"Log concentration" means average concentration across all samples.
+Each dot is a consensus peak.
+
+### DeSeq2 {-}
+```{r MA_D}
+try(dba.plotMA(DBAnalysisDeseq2, method = DBA_DESEQ2_BLOCK),silent=TRUE)
+```
+
+### EdgeR {-}
+```{r MA_E}
+try(dba.plotMA(DBAnalysisEdgeR, method = DBA_EDGER_BLOCK),silent=TRUE)
+```
+
+## Volcano plot {.tabset .tabset-fade}
+Each dot is a consensus peak.
+
+### DeSeq2 {-}
+```{r Volcano1}
+try(dba.plotVolcano(DBAnalysisDeseq2, method = DBA_DESEQ2_BLOCK),silent=TRUE)
+```
+
+### EdgeR {-}
+```{r Volcano2}
+try(dba.plotVolcano(DBAnalysisEdgeR, method = DBA_EDGER_BLOCK),silent=TRUE)
+```
+
+## Heatmap: Differential {.tabset .tabset-fade}
+1000 most significant differential peaks (Deseq2 or EdgeR normalized)
+
+### DeSeq2 {-}
+```{r heatmap4D}
+try(dba.plotHeatmap(DBAnalysisDeseq2,contrast=1,method = DBA_DESEQ2_BLOCK,
+                    correlations=FALSE,margin=20,cexRow=1,cexCol=1),silent=TRUE)
+```
+
+### EdgeR {-}
+```{r heatmap4E}
+try(dba.plotHeatmap(DBAnalysisEdgeR,contrast=1,method = DBA_EDGER_BLOCK,
+                    correlations=FALSE,margin=20,cexRow=1,cexCol=1),silent=TRUE)
+```
+
+## Top 500 differentially bound peaks {.tabset .tabset-fade}
+### DeSeq2 {-}
+```{r Deseq2Report}
+outfile <- paste0(contrasts, "-", peakcaller, "_Diffbind_Deseq2_block.txt")
+outfile2 <- paste0(contrasts, "-", peakcaller, "_Diffbind_Deseq2_block.bed")
+DBReportDeseq2$name <- paste0("Peak",1:length(DBReportDeseq2))
+try(rtracklayer::export(DBReportDeseq2, outfile2),silent=TRUE)
+write.table(DBReportDeseq2, outfile, quote=F, sep="\t", row.names=F)
+D2i <- length(DBReportDeseq2)
+if (D2i == 0) {
+   i=1
+} else if (D2i > 500) {
+   i=500
+} else {
+   i=D2i
+}
+try(DT::datatable(data.frame(DBReportDeseq2)[1:i,], rownames=F),silent=TRUE)
+
+report2 <- dba.report(DBAnalysisDeseq2,method = DBA_DESEQ2_BLOCK,
+                      th=100,bNormalized=T,bFlip=FALSE,precision=0)
+outfile3 <- paste0(contrasts, "-", peakcaller, "_Diffbind_Deseq2_block_fullList.txt")
+write.table(report2, outfile3, quote=F, sep="\t", row.names=F)
+```
+
+### EdgeR {-}
+```{r EdgeRReport}
+outfile <- paste0(contrasts, "-", peakcaller,"_Diffbind_EdgeR_block.txt")
+outfile2 <- paste0(contrasts, "-", peakcaller,"_Diffbind_EdgeR_block.bed")
+DBReportEdgeR$name <- paste0("Peak",1:length(DBReportEdgeR))
+try(rtracklayer::export(DBReportEdgeR, outfile2),silent=TRUE)
+write.table(DBReportEdgeR, outfile, quote=F, sep="\t", row.names=F)
+Ei <- length(DBReportEdgeR)
+if (Ei == 0) {
+   i=1
+} else if (Ei > 500) {
+   i=500
+} else {
+   i=Ei
+}
+try(DT::datatable(data.frame(DBReportEdgeR)[1:i,], rownames=F),silent=TRUE)
+
+report2 <- dba.report(DBAnalysisEdgeR,method = DBA_EDGER_BLOCK,
+                      th=100,bNormalized=T,bFlip=FALSE,precision=0)
+outfile3 <- paste0(contrasts, "-", peakcaller, "_Diffbind_EdgeR_block_fullList.txt")
+write.table(report2, outfile3, quote=F, sep="\t", row.names=F)
+```
+
+## R tool version information
+```{r Info}
+sessionInfo()
+```
+
+<div class="tocify-extend-page" data-unique="tocify-extend-page" style="height: 0;"></div>
\ No newline at end of file
diff --git a/bin/DiffBind_v2_cfChIP_QC.Rmd b/bin/DiffBind_v2_cfChIP_QC.Rmd
new file mode 100644
index 0000000..d058cec
--- /dev/null
+++ b/bin/DiffBind_v2_cfChIP_QC.Rmd
@@ -0,0 +1,204 @@
+---
+title: "DiffBind: cfChIP-seq QC"
+output: 
+    html_document:
+        toc: true
+        toc_float:
+           collapsed: false
+        number_sections: true
+        toc_depth: 3
+        fig_width: 7
+        fig_height: 6
+params:
+    csvfile: samplesheet.csv
+    contrasts: "group1_vs_group2"
+    peakcaller: "macs"
+---
+
+<style type="text/css">
+  body{
+  font-size: 12pt;
+}
+</style>
+
+```{r, include=FALSE, warning=FALSE, message=FALSE}
+## grab args
+dateandtime<-format(Sys.time(), "%a %b %d %Y - %X")
+
+csvfile <- params$csvfile
+contrasts <- params$contrasts
+peakcaller <- params$peakcaller
+```
+
+**Peak sources:**
+    *`r peakcaller`*  
+**Report generated:** 
+    *`r dateandtime`*  
+
+```{r setup, echo=FALSE, warning=FALSE,message=FALSE}
+knitr::opts_chunk$set(echo = FALSE, include=TRUE, message=FALSE, warning=FALSE, error=FALSE)
+suppressMessages(library(DiffBind))
+suppressMessages(library(parallel))
+suppressMessages(library(dplyr))
+suppressMessages(library(tidyr))
+suppressMessages(library(umap))
+suppressMessages(library(ggplot2))
+suppressMessages(library(ggrepel))
+```
+
+# Peak Data
+Read in sample sheet information and peak information
+```{r samples} 
+samples <- dba(sampleSheet=csvfile)
+
+# if samples have Condition values
+if ( sum(samples$class["Condition",] != "") == ncol(samples$class) ) {
+  consensus <- dba.peakset(samples,consensus=DBA_CONDITION, minOverlap = min(table(samples$samples$Condition)))
+}
+print(samples)
+```
+
+## Correlation heatmap: Only peaks
+Pearson correlation of peak positions: all samples versus all samples  
+```{r heatmap1}
+try(dba.plotHeatmap(samples,main="",cexRow=1,cexCol=1),silent=TRUE)
+```
+
+## PCA: Only peaks
+Variance of peak positions  
+```{r PCA1, fig.height=5,fig.width=5}
+try(dba.plotPCA(samples),silent=TRUE)
+```
+
+## Overlapping peak counts
+Number of overlapping peaks.  
+If the number of samples is greater than 4, a "consensus" peak Venn diagram is created, where
+the consensus peak set are the peaks identified in at least 2 samples for that condition. This is different
+from the consensus peak set used for differential analyses.
+```{r Venn, fig.height=4}
+if (nrow(samples$samples) < 5) {  # 4 or fewer samples: overlap every sample directly
+   dba.plotVenn(samples,1:nrow(samples$samples))
+} else {
+   if ( sum(samples$class["Condition",] != "") == ncol(samples$class) ) {  # every sample has a Condition value
+      dba.plotVenn(consensus,consensus$masks$Consensus,main="Binding Site Overlaps: 'consensus', comparing between groups")
+   } else {
+      print("Consensus peaks were not called")
+   }
+}
+```
+
+# Consensus peaks and counts
+Consensus peaks are peaks found in at least two samples, independent of condition.
+FRiP is of consensus peaks and will not match FRiP values calculated outside of this tool.
+```{r peaksORsummits}
+if ( grepl("narrow",samples$samples$Peaks[1]) ) {
+   summits <- TRUE
+   print ("Narrow peak calling tool.")
+   print ("Differential peaks are 250bp upstream and downstream of the summits.")
+} else if ( grepl("broad",samples$samples$Peaks[1]) ) {
+  summits <- FALSE
+  print ("Broad peak calling tool.")
+  print ("Differential peaks are consensus peaks.")
+} else {
+  summits <- FALSE
+  print ("Indeterminate peak calling tool.")
+  print ("Differential peaks are consensus peaks.")
+}
+```
+
+```{r DBcount}
+
+if ( sum(samples$class["Condition",] != "") == ncol(samples$class) ) {
+  minOv <- min(table(samples$samples$Condition))
+} else {
+  minOv <- floor(ncol(samples$class)/3)
+}
+
+print(paste0("The minimum number of overlaps is: ", minOv))
+
+if (summits == TRUE) {
+  DBdataCounts <- dba.count(samples, summits=250, minOverlap = minOv)
+} else {
+  DBdataCounts <- dba.count(samples, minOverlap = minOv)
+}
+print(DBdataCounts)
+
+```
+
+## Correlation heatmap: Peaks and reads
+Pearson correlation of library-size normalized counts of consensus peaks: all samples versus all samples
+```{r heatmap2}
+try(dba.plotHeatmap(DBdataCounts,main="",cexRow=1,cexCol=1),silent=TRUE)
+```
+
+## Heatmap: Average signal across each peak
+1000 most variable consensus peaks (library-size normalized counts)
+```{r heatmap3}
+try(dba.plotHeatmap(DBdataCounts,correlations=FALSE,cexRow=1,cexCol=1),silent=TRUE)
+```
+
+## PCA: Peaks and reads
+Variation of library-size normalized counts of consensus peaks
+```{r PCA2, fig.height=5,fig.width=5}
+try(dba.plotPCA(DBdataCounts),silent=TRUE)
+```
+
+```{r TMM}
+vec <- c("seqnames", "start", "end", "width", "strand", samples$samples$SampleID)
+consensus2 <- dba.peakset(DBdataCounts, bRetrieve=TRUE) %>% ##extracts TMM-normalized  counts
+  as.data.frame() %>% setNames(vec) %>% arrange(start, end) %>% mutate(Peaks = paste0("Peak",1:nrow(.))) %>% 
+  dplyr::select(1:4, Peaks, samples$samples$SampleID)
+
+outfile1 <- paste0(contrasts, "-", peakcaller, "_DiffBindQC_TMMcounts.csv")
+write.csv(consensus2, outfile1, row.names = F)
+
+outfile2 <- paste0(contrasts, "-", peakcaller, "_DiffBindQC_TMMcounts.bed")
+write.table(consensus2[,c("seqnames","start","end","Peaks")],
+    outfile2, quote=F, sep="\t", row.names=F, col.names=F)
+
+counts_TMM_ALL <- consensus2
+rownames(counts_TMM_ALL) <- counts_TMM_ALL$Peaks
+counts_TMM_ALL$Peaks <- NULL
+
+counts_TMM_ALL <- counts_TMM_ALL %>% dplyr::select(5:ncol(.)) %>%
+  t() %>% log10() %>% as.data.frame(.)
+##UMAP coordinates
+set.seed(123)
+if (nrow(samples$samples) < 16) {
+  umap_coord <- umap(counts_TMM_ALL, n_neighbors= nrow(samples$samples)-1)
+} else {
+  umap_coord <- umap(counts_TMM_ALL)
+}
+umap_coord <-as.data.frame(umap_coord$layout) %>% setNames(c("UMAP1", "UMAP2"))
+
+outfile <- paste0(contrasts, "-", peakcaller, "_DiffBindQC_UMAP.csv")
+write.csv(umap_coord, outfile, row.names = F)
+```
+
+## UMAP: peaks and reads 
+```{r UMAP_plot}
+p <- ggplot(umap_coord,aes(x = UMAP1, y = UMAP2, label = samples$samples$SampleID))+ ##With labels
+  geom_point(aes(color=samples$samples$Condition), size = 3) +
+  theme_bw()+ ggtitle(paste0("log-transformed counts:", "n = ", nrow(umap_coord))) +
+  theme(plot.title = element_text(hjust = 0.5)) +
+  labs(color = "Phenotypes") + theme(text=element_text(size=15))+
+  geom_text_repel(point.size = NA, size = 2.5)
+q <- ggplot(umap_coord,aes(x = UMAP1, y = UMAP2)) + ##No labels
+  geom_point(aes(color=samples$samples$Condition), size = 3) +
+  theme_bw()+ ggtitle(paste0("log-transformed counts:", "n = ", nrow(umap_coord))) +
+  theme(plot.title = element_text(hjust = 0.5)) +
+  labs(color ="Phenotypes") +  theme(text=element_text(size=15))
+  ##geom_text_repel(point.size = NA, size = 2.5)
+p
+
+if ( sum(samples$class["Condition",] != "") == ncol(samples$class) ) {
+q
+}
+```
+
+## R tool version information
+```{r Info}
+sessionInfo()
+```
+
+<div class="tocify-extend-page" data-unique="tocify-extend-page" style="height: 0;"></div>
diff --git a/bin/FRiP_plot.R b/bin/FRiP_plot.R
new file mode 100644
index 0000000..8a81f49
--- /dev/null
+++ b/bin/FRiP_plot.R
@@ -0,0 +1,112 @@
+## FRIP_plot.R
+## Created by Tovah Markowitz
+## June 19, 2020
+## Updated: Jan 19, 2022
+## Updated: November 4, 2022
+
+args <- commandArgs(trailingOnly = TRUE)
+folder <- args[1]
+
+library(ggplot2)
+library(rjson)
+
+merge_files <- function(folder) {
+  files <- list.files(path=paste0(folder,"/PeakQC"), pattern="FRiP_table.txt", 
+  	   	full.names=T)
+  allList <- lapply(files,read.table,header=T)
+  allData <- do.call(rbind.data.frame, allList)
+  write.table(allData, paste0(folder, "/PeakQC/FRiP_All_table.txt"), quote=F, 
+  		row.names=F, sep="\t")
+  return(allData)
+}
+
+plot_barplots <- function(inData, groupName, folder) {
+  p <- ggplot(inData,aes(x=bamsample, y=FRiP, fill=bedsample))
+  p <- p + geom_bar(position="dodge",stat = "identity") +
+    facet_wrap(.~bedtool) +
+    theme_bw() +
+    theme(axis.text.x=element_text(angle = -15, hjust = 0)) +
+    labs(title=groupName, x="bam file", y ="Fraction of Reads in Peaks (FRiP)", 
+         fill ="peak file")
+  pdf(paste0(folder, "/PeakQC/", groupName,".FRiP_barplot.pdf"))
+  print(p)
+  dev.off()
+}
+
+plot_scatterplots <- function(inData, groupName, folder) {
+  p <- ggplot(inData,aes(x=n_basesM, y=FRiP, shape=bedsample, color=bedtool))
+  p <- p + geom_point(size=2.5) +
+    facet_wrap(.~bamsample) +
+    theme_bw() + 
+    scale_x_continuous(trans = "log10") +
+    labs(title=groupName, x="Number of Bases in Peaks (M)", 
+         y="Fraction of Reads in Peaks (FRiP)",
+         shape="peak file", color="peak calling tool")
+  q <- p + annotation_logticks(sides="b")
+  pdf(paste0(folder, "/PeakQC/", groupName,".FRiP_scatterplot.pdf"))
+  tryCatch(print(q), error = function(e) {print(p)})
+  dev.off()
+}
+
+plot_barplots_self <- function(inData2, folder) {
+  p <- ggplot(inData2,aes(x=bamsample, y=FRiP, fill=groupInfo))
+  p <- p + geom_bar(position="dodge",stat = "identity") +
+    facet_wrap(.~bedtool) +
+    theme_bw() +
+    theme(axis.text.x=element_text(angle = -15, hjust = 0)) +
+    labs(title="All Samples",x="bam file", y ="Fraction of Reads in Peaks (FRiP)", 
+         fill ="Group")
+  pdf(paste0(folder, "/PeakQC/FRiP_barplot.pdf"))
+  print(p)
+  dev.off()
+}
+
+plot_scatterplots_self <- function(inData2, folder) {
+  # Scatterplot of FRiP against bases covered by peaks (log10 x axis), saved as
+  # <folder>/PeakQC/FRiP_scatterplot.pdf. Shape = bedtool, color = groupInfo.
+  p <- ggplot(inData2,aes(x=n_basesM, y=FRiP, shape=bedtool, color=groupInfo)) +
+    geom_point(size=2.5) + theme_bw() +
+    scale_x_continuous(trans = "log10") + annotation_logticks(sides="b") +
+    labs(title="All samples", x="Number of Bases in Peaks (M)",
+         y="Fraction of Reads in Peaks (FRiP)",  # legend titles follow the mapped columns
+         shape="peak calling tool", color="Group")
+  pdf(paste0(folder, "/PeakQC/FRiP_scatterplot.pdf"))
+  print(p)
+  dev.off()
+}
+
+process_json <- function(injson) {
+# Get the identities of the groups and the list of samples (ChIP and input)
+# associated with each one. Returns a named list: group name -> sample names.
+  json  <- fromJSON(file = injson)
+  groupsInfo <- json$project$groups
+  # Keep the inputs as a plain list: as.data.frame() would rewrite sample names
+  # that are not syntactic R names (e.g. "S-1" -> "S.1") and break the lookup.
+  inputs <- json$project$peaks$inputs
+  for (i in seq_along(groupsInfo)) {
+    tmp <- unique(as.character(unlist(inputs[names(inputs) %in% groupsInfo[[i]]])))
+    # Drop blank entries (presumably ChIP samples without an input); tmp may then
+    # be empty, in which case the group is left unchanged.
+    groupsInfo[[i]] <- c(groupsInfo[[i]], tmp[!is.na(tmp) & tmp != ""])
+  }
+  return(groupsInfo)
+}
+
+allData <- merge_files(folder)
+groupList <- process_json(paste0(folder,"/config.json"))
+
+for (i in 1:length(groupList)) {
+  group <- groupList[[i]]
+  groupName <- names(groupList)[i]
+  inData <- allData[which((allData$bedsample %in% group) & 
+  	    	          (allData$bamsample %in% group)),]
+  plot_barplots(inData, groupName, folder)
+  plot_scatterplots(inData, groupName, folder)
+}
+
+selfData <- allData[which(allData$bedsample == allData$bamsample),]
+groupInfo <- reshape2::melt(groupList)
+names(groupInfo) <- c("bamsample","groupInfo")
+selfData2 <- merge(selfData,groupInfo)
+plot_barplots_self(selfData2, folder)
+plot_scatterplots_self(selfData2, folder)
diff --git a/bin/atac_nrf.py b/bin/atac_nrf.py
new file mode 100644
index 0000000..edf21aa
--- /dev/null
+++ b/bin/atac_nrf.py
@@ -0,0 +1,22 @@
+from __future__ import print_function
+import sys
+
+preseq_log=sys.argv[1]
+
+tot_reads = distinct_reads = one_pair = two_pair = None  # stay None until the matching log line is seen
+with open(preseq_log, 'r') as fp:
+        for line in fp:
+            if line.startswith('TOTAL READS'):
+                tot_reads = float(line.strip().split("= ")[1])
+            elif line.startswith('DISTINCT READS'):
+                distinct_reads = float(line.strip().split('= ')[1])
+            elif line.startswith('1\t'):
+                one_pair = float(line.strip().split()[1])
+            elif line.startswith('2\t'):
+                two_pair = float(line.strip().split()[1])
+if None in (tot_reads, distinct_reads, one_pair, two_pair):
+    sys.exit("atac_nrf.py: could not find all required fields in " + preseq_log)
+NRF = distinct_reads/tot_reads
+PBC1 = one_pair/distinct_reads
+PBC2 = one_pair/two_pair
+
diff --git a/bin/bam_filter_by_mapq.py b/bin/bam_filter_by_mapq.py
new file mode 100644
index 0000000..12037cd
--- /dev/null
+++ b/bin/bam_filter_by_mapq.py
@@ -0,0 +1,40 @@
+import pysam,sys
+import argparse
+
+parser = argparse.ArgumentParser(description='filter PE bamfile by mapQ values')
+parser.add_argument('-i',dest='inBam',required=True,help='Input Bam File')
+parser.add_argument('-o',dest='outBam',required=True,help='Output Bam File')
+parser.add_argument('-q',dest='mapQ',type=int,required=False,help='mapQ value ... default 6',default=6)
+args = parser.parse_args()
+
+# Keep only read pairs in which every primary, properly paired mate has
+# mapQ >= the -q cut-off; supplementary/secondary/duplicate records are dropped.
+def is_filtered(read):
+        """True for records that are never written: supplementary, secondary or duplicate."""
+        return read.is_supplementary or read.is_secondary or read.is_duplicate
+
+# Pass 1: record, per read name, whether any primary, properly paired mate
+# passes or fails the mapQ cut-off.  Both sets are needed because the mates
+# can arrive in either order, so the low-quality mate may be seen first.
+passed=set()
+failed=set()
+samfile = pysam.AlignmentFile(args.inBam, "rb")
+for read in samfile.fetch():
+        if read.is_unmapped or is_filtered(read) or not read.is_proper_pair:
+                continue
+        if read.mapping_quality < args.mapQ:
+                failed.add(read.query_name)
+        else:
+                passed.add(read.query_name)
+samfile.close()
+mapq = passed - failed  # keep a pair only when no mate fell below the cut-off
+
+# Pass 2: copy the surviving pairs to the output, again skipping
+# supplementary, secondary and duplicate records.
+samfile = pysam.AlignmentFile(args.inBam, "rb")
+pairedreads = pysam.AlignmentFile(args.outBam, "wb", template=samfile)
+for read in samfile.fetch():
+        if read.query_name in mapq and not is_filtered(read):
+                pairedreads.write(read)
+samfile.close()
+pairedreads.close()
diff --git a/bin/cfChIP_signatures.R b/bin/cfChIP_signatures.R
new file mode 100755
index 0000000..778b75a
--- /dev/null
+++ b/bin/cfChIP_signatures.R
@@ -0,0 +1,97 @@
+####################
+#
+# Name: cfChIP_signatures.R
+# Created by: Tovah Markowitz, PhD
+# Bioinformatics (NCBR)/ Integrated Data Sciences Section (IDSS)
+# Research Technologies Branch/DIR/NIAID
+#
+# Created: August 9, 2022
+# 
+####################
+#
+# Purpose: To take the individual cfChIP signature tables, combine them, 
+#          and create the preferred output plot
+#
+# Functions: mergeSignatures and plotSignatures
+#
+# Requires: ggplot2 and ggprism (for plotting)
+#
+# Details: mergeSignatures will take a folder of signatures and combine them 
+#         into one long table. plotSignatures can directly take the output
+#         of mergeSignatures, but you can also load the data into R and filter
+#         to only include a subset of samples before running the function. Also,
+#         add a column called "Condition" either to the input txt file or the R
+#         object to group columns in the plot using that additional information.
+#
+# Function1: mergeSignatures(folder, outFile)
+#   folder:  [required] the path to the folder containing the individual signature
+#            files direct from the cfChIP tool
+#   outFile: [required] the name of the output txt file to save the data
+#
+# Function2: plotSignatures(inTXT, outPDF)
+#   inTXT: [required] either the name of the file from mergeSignatures or
+#          an R object containing the data. Column names must match those of the
+#          output of mergeSignatures, but column order doesn't matter
+#   outPDF: [required] the name of the output pdf file to create
+#
+# Example usage:
+#   source("cfChIP_signatures.R")
+#   mergeSignatures("cfChIPtool/Output/H3K4me3/Signatures/", "out.txt")
+#   plotSignatures("out.txt", "out.pdf")
+#   plotSignatures(signatureDataFrame, "out.pdf")
+# 
+####################
+
+mergeSignatures <- function(folder, outFile) {
+  # Combine every per-sample signature csv in `folder` into one long table, tagging
+  # each row with its sample (file name minus ".csv"), and write it tab-delimited.
+  files <- list.files(folder, pattern="\\.csv$", full.names = T)
+  samples <- sub("\\.csv$", "", basename(files))  # one name per file, same order
+  sigList <- lapply(seq_along(files), function(i) {
+    data.frame(read.csv(files[i]), Sample=samples[i])
+  })
+  write.table(do.call("rbind",sigList), outFile, quote=F, sep="\t", row.names=F)
+}
+
+plotSignatures <- function(inTXT, outPDF) {
+  library(ggplot2)
+  library(ggprism)
+
+  if (mode(inTXT) == "character") { # if using a file name
+    sigData <- read.delim(inTXT)
+  } else { # if starting with an object in R
+   sigData <- as.data.frame(inTXT) 
+  }
+  sigData$NormalizedCounts[which(sigData$NormalizedCounts > 3)] <- 3
+  sigData$NormalizedCounts[which(sigData$NormalizedCounts < 0.15)] <- 0.15
+  sigData$qValue[which(sigData$qValue > 300)] <- 300
+  sigData$qValue[which(sigData$qValue < 5)] <- NA
+  names(sigData)[1] <- "cellType"
+
+  cellTypes <- data.frame(cellType=c("Neutrophils","Monocytes","Megakaryocyte",
+                                   "Erythroblast","T-Cells","B-Cells","NK",
+                                   "Vasculary","Adipose","Skin","Sk. Muscle",
+                                   "Brain","Heart","Lung","Breast","Digestive",
+                                   "Pancreas"),
+                        class=c(rep("Blood",7),rep("Global",4),rep("Other",6)) )
+
+  sigData2 <- merge(sigData,cellTypes)
+  sigData2$cellType <- factor(sigData2$cellType,levels=rev(cellTypes$cellType))
+
+  pdf(outPDF)
+  p <- ggplot(data=sigData2,aes(x=Sample,y=cellType,color=NormalizedCounts,size=qValue))
+  p <- p + geom_point() + 
+    scale_size(limits=c(5,300),breaks=c(5,50,100,150,200,250,300),
+               labels=paste0("e-",c(5,50,100,150,200,250,300))) +
+    scale_color_viridis_c(direction = -1, option="A") +
+    theme_prism() +
+    theme(axis.title.y = element_blank(), axis.title.x = element_blank(),
+          axis.text.x = element_text(angle = 45,vjust=0.9,hjust=1))
+  if (sum(names(sigData) == "Condition") == 1) {
+    p <- p + facet_grid(rows=vars(class),cols=vars(Condition),scales="free",space="free")
+  } else {
+    p <- p + facet_grid(rows=vars(class),scales="free",space="free")
+  }
+  print(p)
+  dev.off()
+}
\ No newline at end of file
diff --git a/bin/frip.py b/bin/frip.py
new file mode 100644
index 0000000..113eb62
--- /dev/null
+++ b/bin/frip.py
@@ -0,0 +1,164 @@
+#!/usr/bin/env python3
+
+"""
+Name: frip.py
+Created by: Tovah Markowitz
+Date: 06/18/20
+
+Purpose: To calculate FRiP scores, one bam file and as many bedfiles as wanted as inputs
+Currently only works with python/3.5
+"""
+
+##########################################
+# Modules
+import argparse
+from argparse import RawTextHelpFormatter
+from pybedtools import BedTool
+import pysam
+import pandas as pd
+
+##########################################
+# Functions
+
+def split_infiles(infiles):
+    """
+    Turn a quoted string of file names into a list; the names are separated
+    by spaces or, when no space is present, by semicolons.
+    """
+    unquoted = infiles.strip("\'").strip('\"')
+    if " " in unquoted:
+        return unquoted.split(" ")
+    return unquoted.split(";")
+
+def count_reads_in_bed(bam, bedfile, genomefile):
+    """
+    some of this comes directly from the pybedtools site; read in
+    bed (or bed-like) file, sort it, and then count the number of
+    reads within the regions
+    """
+    # BedTool.sort() returns a new BedTool; keep it so the sorted intervals are used
+    bedinfo = BedTool(bedfile).sort(g=genomefile)
+    return (
+        BedTool(bam).intersect( bedinfo, bed=True, stream=True, )
+        ).count()
+
+def count_reads_in_bam(bam):
+    """ count the number of reads in a given bam file """
+    return( pysam.AlignmentFile(bam).mapped )
+
+def calculate_frip(nreads, noverlaps):
+    """ FRiP = reads overlapping peaks divided by total reads, as a float """
+    return( noverlaps / float(nreads) )
+
+def measure_bedfile_coverage(bedfile, genomefile):
+    """ calculate the number of bases covered by a given bed file """
+    bedinfo = BedTool(bedfile)
+    return( bedinfo.sort(g=genomefile).total_coverage() )
+
+def clip_bamfile_name(bamfile):
+    """
+    Derive (sample, condition) from a bam path: sample is the file name up to the
+    first dot, condition is everything between that and the final extension;
+    assumes file naming system matches that of Pipeliner
+    """
+    pieces = bamfile.split("/")[-1].split(".")
+    return( pieces[0], ".".join(pieces[1:-1]) )
+
+def clip_bedfile_name(bedfile,filetype):
+    """
+    clip bed file name for table/plotting purposes; assumes file
+    naming system matches that of Pipeliner. Returns (toolused, sample).
+    """
+    parts = bedfile.split("/")
+    if filetype == "":
+        return( parts[-3], parts[-2] )  # tool and sample are the two enclosing directories
+    sample = parts[-1].split(".")[0]
+    for suffix in ("_broadpeaks", "_peaks"):  # drop a whole suffix; str.strip() would eat characters
+        sample = sample[:-len(suffix)] if sample.endswith(suffix) else sample
+    return( filetype, sample )
+
+def process_files(bamfile, bedfiles, genome, filetypes):
+    """ 
+    this is the main function to take in list of input files and 
+    put out an array containing key file name information, read 
+    counts, and FRiP scores
+    """
+    bedfileL = bedfiles
+    filetypesL = filetypes
+    out = [[ "bedtool", "bedsample", "bamsample", "bamcondition", 
+    "n_reads", "n_overlap_reads", "FRiP", "n_basesM" ]]
+    nreads = count_reads_in_bam(bamfile)
+    (bamsample, condition) = clip_bamfile_name(bamfile)
+    for i in range(len(bedfileL)):
+        bed = bedfileL[i]
+        if len(filetypesL) > 1:
+            filetype = filetypesL[i]
+        else:
+            filetype = filetypesL[0]
+        (bedtool, bedsample) = clip_bedfile_name(bed,filetype)
+        noverlaps = count_reads_in_bed(bamfile, bed, genome)
+        frip = calculate_frip(nreads, noverlaps)
+        nbases = measure_bedfile_coverage(bed, genome) / 1000000
+        out.append( [bedtool, bedsample, bamsample, condition, 
+                nreads, noverlaps, frip, nbases] )
+    out2 = pd.DataFrame(out[1:], columns=out[0])
+    return(out2)
+
+def create_outfile_name(bamfile, outroot):
+    """ uses outroot to create the output file name """
+    (bamsample, condition) = clip_bamfile_name(bamfile)
+    outtable = bamsample + "." + condition + "." + "FRiP_table.txt"
+    if outroot != "":
+        outtable = outroot + "." + outtable
+    return(outtable)
+
+def write_table(out2, outtable):
+    out2.to_csv(outtable,sep='\t',index=False)
+
+
+###############################################
+# Main
+
+def main():
+    """ parse the command line, compute the FRiP table, and write it to a file """
+    desc="""
+This function takes a space-delimited or semi-colon delimited list
+of bed-like files (extensions must be recognizable by bedtools)
+and a single bam file. It will then calculate the FRiP score for
+all possible combinations of files and save the information in a
+txt file. It will also calculate the number of bases covered by 
+each bed-like file. Note: this function assumes that the file 
+naming system of the input files matches that of Pipeliner.
+    """
+    parser = argparse.ArgumentParser(description=desc, formatter_class=RawTextHelpFormatter)
+    parser.add_argument('-p', nargs = '+', required=True, type=str, help='A space- or semicolon-delimited list of peakfiles \
+(or bed-like files).')
+    parser.add_argument('-b', required=True, type=str, help='The name of a bamfile to analyze.')
+    parser.add_argument('-g', required=True, type=str, help='The name of the .genome file so bedtools knows the \
+size of every chromosome.')
+    parser.add_argument('-o', required=False, default="", type=str, help='The root name of the multiple output files. Default:""')
+    # nargs='+' yields one string per type; type=list would split a value into single characters
+    parser.add_argument('-t', nargs='+', required=False, default=[""], type=str, help='A space- \
+or semicolon-delimited list of input file sources/types. Only needed when \
+source of bed file is not built into the script. Default: ""')
+
+    args = parser.parse_args()
+    bedfiles = args.p
+    bamfile = args.b
+    genomefile = args.g
+    outroot = args.o
+    filetypes = args.t
+    out2 = process_files(bamfile, bedfiles, genomefile, filetypes)
+    outtable = create_outfile_name(bamfile, outroot)
+    write_table(out2, outtable)
+
+if __name__ == '__main__':
+    main()
+
+###############################################
+# example cases
+
+#bedfiles = "macs_broad/mWT_HCF1_mm_i81/mWT_HCF1_mm_i81_peaks.broadPeak macs_broad/mWT_HCF1_mm_i89/mWT_HCF1_mm_i89_peaks.broadPeak"
+#bamfiles = "bam/Input_mm_i95.sorted.Q5DD.bam bam/mWT_HCF1_mm_i81.sorted.Q5DD.bam bam/mWT_HCF1_mm_i89.sorted.Q5DD.bam"
+#genomefile = "/data/CCBR_Pipeliner/db/PipeDB/Indices/mm10_basic/indexes/mm10.fa.sizes"
+#out2 = pd.read_csv("FRIP_test.txt",sep="\t")
diff --git a/bin/jaccard_score.py b/bin/jaccard_score.py
new file mode 100644
index 0000000..378ee69
--- /dev/null
+++ b/bin/jaccard_score.py
@@ -0,0 +1,202 @@
+#!/usr/bin/env python3
+
+"""
+Name: jaccard_score.py
+Created by: Tovah Markowitz
+Date: 1/23/19
+Updated: 8/5/19 to compare multiple tools and create plots
+
+Purpose: To do all pairwise comparisons of bed/peak files given. Uses bedtools
+to calculate a jaccard score for every comparison. All data is saved in a 
+single tab-delimited file.
+"""
+
+##########################################
+# Modules
+import optparse
+from pybedtools import BedTool
+import pandas as pd
+from sklearn.decomposition import PCA as sklearnPCA
+import matplotlib as mpl
+mpl.use('Agg')
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+##########################################
+# Functions
+
+def split_infiles(infiles):
+    """ breaks the infile string with space-delimited file names and creates a list.
+    also works for infile types
+    """
+    infileList = infiles.strip("\'").strip('\"').split(" ")
+    if len(infileList) == 1:
+        infileList = infileList[0].split(";")
+    return(infileList)
+
+def loop_jaccard(infileList, genomefile, filetypeList):
+    """ Uses two loops to do all possible pairwise comparisons of files 
+    in a list. Returns a writeable output and a pandas object
+    """
+    nfiles = len(infileList)
+    (colnames, snames) = get_colnames(infileList, filetypeList)
+    out = [[1.000] * nfiles for i in range(nfiles)]
+    outTable = []
+    for z in range(nfiles):
+        fileA = infileList[z]
+        print("fileA is: " + fileA) 
+        for y in range(z+1,nfiles):
+            fileB = infileList[y]
+            (data, keylist) = run_jaccard(fileA, fileB, genomefile)
+            out[z][y] = data[3]
+            out[y][z] = data[3]
+            if filetypeList != [""]:
+                keylist.insert(1, "toolA")
+                keylist.insert(3, "toolB")
+                data.insert(1, filetypeList[z])
+                data.insert(3, filetypeList[y])
+            if len(outTable) == 0:
+                outTable.append( "\t".join(keylist) )
+            outTable.append( "\t".join(data) )
+        out2 = pd.DataFrame(out, columns=colnames, index=colnames,dtype="float")
+    return(outTable, out2, snames)
+
+def run_jaccard(fileA, fileB, genomefile):
+    """ Running bedtools. Reads in two bedtools approved file types, sorts the files, 
+    and calculates a jaccard score.
+    """
+    a = BedTool(fileA)
+    a = a.sort(g=genomefile)
+    b = BedTool(fileB)
+    b = b.sort(g=genomefile)
+    j = a.jaccard(b,g=genomefile)
+    j["fileA"] = fileA.split("/")[-1]
+    j["fileB"] = fileB.split("/")[-1]
+    keylist = list(j.keys())
+    keylist.sort()
+    data = [ str(j[key]) for key in keylist ]
+    return(data, keylist)
+
+def get_colnames(infileList, filetypeList):
+    snames = [ i.split("/")[-1].split(".")[0].strip("_peaks").strip("_broadpeaks") for i in infileList ]
+    if filetypeList == [""]:
+        colnames = snames
+    else:
+        colnames = [ snames[i] + "_" + filetypeList[i] for i in range(len(snames)) ]
+    return(colnames, snames)
+
+def create_outfile_names(outroot):
+    """ uses outroot to create the output file names """
+    outTableFile = "jaccard.txt"
+    outPCAFile = "jaccard_PCA.pdf"
+    outHeatmapFile = "jaccard_heatmap.pdf"
+    if outroot != "":
+        if outroot[-1] == "/":
+            outTableFile= outroot + outTableFile
+            outPCAFile = outroot + outPCAFile
+            outHeatmapFile = outroot + outHeatmapFile
+        else:
+            outTableFile= outroot + "_" + outTableFile
+            outPCAFile = outroot + "." + outPCAFile
+            outHeatmapFile = outroot + "." + outHeatmapFile
+    return(outTableFile, outPCAFile, outHeatmapFile)
+
+def pca_plot(out, filetypeList, snames, outPCAFile):
+    """ creates a 2D PCA plot comparing the files based upon jaccard scores
+    """
+    sklearn_pca = sklearnPCA(n_components=2)
+    Y_sklearn = sklearn_pca.fit_transform(out)
+    PCAdata = pd.DataFrame(Y_sklearn,columns=["PC1","PC2"])
+    PCAdata.insert(0,"sample name",snames)
+    fig, ax =plt.subplots()
+    snames_pal = sns.hls_palette(len(set(snames)),s=.8)
+    sns.set_palette(snames_pal)
+    if filetypeList != [""]:
+        PCAdata.insert(1,"tool",filetypeList)
+        ax = sns.scatterplot(x="PC1",y="PC2",hue="sample name",style="tool",data=PCAdata,s=100)
+    else:
+        ax = sns.scatterplot(x="PC1",y="PC2",hue="sample name",data=PCAdata,s=100)
+    ax.axhline(y=0, color='grey', linewidth=1,linestyle="--")
+    ax.axvline(x=0, color='grey', linewidth=1,linestyle="--")
+    ax.set(xlabel= "PC1 (" + str(round(100*sklearn_pca.explained_variance_[0],2)) + "%)",
+           ylabel= "PC2 (" + str(round(100*sklearn_pca.explained_variance_[1],2)) + "%)")
+    plt.legend(bbox_to_anchor=(1.05, 1), loc=2)
+    #plt.show()
+    plt.savefig(outPCAFile, bbox_inches='tight')
+    plt.close("all")
+
+def plot_heatmap(out, outHeatmapFile, snames, filetypeList):
+    """ saves a heatmap of out to outHeatmapFile with row colour strips per sample name and, when tools are given, per tool """
+    snames_pal = sns.hls_palette(len(set(snames)),s=.8)
+    snames_lut = dict(zip(set(snames), snames_pal))  # sample name -> colour
+    snames_cols = pd.Series(snames,index=out.index).map(snames_lut)  # one colour per row of out
+    if filetypeList != [""]:
+       tool_pal = sns.cubehelix_palette(len(set(filetypeList)))
+       tool_lut = dict(zip(set(filetypeList), tool_pal))  # tool -> colour
+       tool_cols = pd.Series(filetypeList,index=out.index).map(tool_lut)
+       g = sns.clustermap(out,cmap="YlGnBu",col_cluster=False,
+                    row_colors=[snames_cols,tool_cols])
+       for label in set(snames):  # zero-size bars: drawn only so each label gets a legend entry
+            g.ax_col_dendrogram.bar(0, 0, color=snames_lut[label],
+                            label=label, linewidth=0)
+       for label in set(filetypeList):  # same legend trick for the tools
+            g.ax_col_dendrogram.bar(0, 0, color=tool_lut[label],
+                            label=label, linewidth=0)
+       g.ax_col_dendrogram.legend(loc="center", ncol=3, 
+                                bbox_to_anchor=(0.4, 0.8))
+    else:
+       g = sns.clustermap(out,cmap="YlGnBu",col_cluster=False,
+                    row_colors=snames_cols)
+       for label in set(snames):  # zero-size bars: drawn only so each label gets a legend entry
+            g.ax_col_dendrogram.bar(0, 0, color=snames_lut[label],
+                            label=label, linewidth=0)
+       g.ax_col_dendrogram.legend(loc="center", ncol=3, 
+                                bbox_to_anchor=(0.5, 0.8))
+    plt.savefig(outHeatmapFile, bbox_inches='tight')
+    plt.close("all")
+
+def write_out(out, outFile):
+    f = open(outFile, 'w')
+    f.write( "\n".join(out) )
+    f.close()
+
+##########################################
+# Main
+
+def main():
+    desc="""
+    This function takes a space-delimited list of files (bed, bedgraph, gff, gtf, etc.)
+    and calculates all possible pairwise jaccard scores. From bedtools: 'Jaccard is the 
+    length of the intersection over the union. Values range from 0 (no intersection) to 
+    1 (self intersection)'. The columns of the output file are: fileA, fileB, 
+    intersection, jaccard, n_intersections, and union-intersection.
+    """
+
+    parser = optparse.OptionParser(description=desc)
+
+    parser.add_option('-i', dest='infiles', default='', help='A space- or semicolon-delimited list of \
+input files for analysis.')
+    parser.add_option('-t', dest='filetypes', default='', help='A space- or semicolon-delimited list \
+of input file sources/types.')
+    parser.add_option('-o', dest='outroot', default='', help='The root name of the output files \
+where all the jaccard score information will be saved.')
+    parser.add_option('-g', dest='genomefile', default='', help='The name of the .genome file.')
+
+    (options,args) = parser.parse_args()
+    infiles = options.infiles
+    filetypes = options.filetypes
+    outroot = options.outroot
+    genomefile = options.genomefile
+
+    infileList = split_infiles(infiles)
+    filetypeList = split_infiles(filetypes)
+    (outTable, out, snames) = loop_jaccard(infileList, genomefile, filetypeList)
+    (outTableFile, outPCAFile, outHeatmapFile) = create_outfile_names(outroot)
+    write_out(outTable, outTableFile)
+    pca_plot(out, filetypeList, snames, outPCAFile)
+    plot_heatmap(out, outHeatmapFile, snames, filetypeList)
+
+if __name__ == '__main__':
+    main()
+
+
diff --git a/bin/ppqt_process.py b/bin/ppqt_process.py
new file mode 100644
index 0000000..9a2c9b1
--- /dev/null
+++ b/bin/ppqt_process.py
@@ -0,0 +1,27 @@
+#!/usr/bin/env python3
+
+#Purpose: To grab the estimated fragment length from the ppqt output and a small txt with that information. For input files, adding an extra value of 200bp as an alternative.
+import argparse
+parser = argparse.ArgumentParser(description='Script to extract the the estimated fragment length from the ppqt output.')
+parser.add_argument('-i', required=True,help='Name of the ppqt txt file')
+parser.add_argument('-o', required=True,help='Name of the output file')
+args = parser.parse_args()
+
+output = args.o
+inppqt = args.i
+
+o=open(output,'w')
+
+file = list(map(lambda z:z.strip().split(),open(inppqt,'r').readlines()))
+
+
+ppqt_values = file[0][2].split(",")
+extenders = []
+for ppqt_value in ppqt_values:
+    if int(ppqt_value) > 150:
+        extenders.append(ppqt_value)
+if len(extenders) > 0:
+    o.write(extenders[0])
+else:
+    o.write("200")      
+o.close()
\ No newline at end of file
diff --git a/bin/prep_diffbind.py b/bin/prep_diffbind.py
new file mode 100644
index 0000000..4ba7d64
--- /dev/null
+++ b/bin/prep_diffbind.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python3
+
+import json
+import argparse
+
+parser = argparse.ArgumentParser(description='Script to prepare the DiffBind input csv')
+parser.add_argument('--g1',dest='group1',required=True,help='Name of the first group')
+parser.add_argument('--g2',dest='group2',required=True,help='Name of the second group')
+parser.add_argument('--wp',dest='workpath',required=True,help='Full path of the working directory')
+parser.add_argument('--pt',dest='peaktool',required=True,help='Name of the the peak calling tool, also the directory where the peak file will be located')
+parser.add_argument('--pe',dest='peakextension',required=True,help='The file extension of the peakcall output')
+parser.add_argument('--pc',dest='peakcaller',required=True,help='Value for the PeakCaller column of the DiffBind csv')
+parser.add_argument('--bd',dest='bamdir',required=True,help='Name of the directory where the bam files are located')
+parser.add_argument('--csv',dest='csvfile',required=True,help='Name of the output csv file')
+
+args = parser.parse_args()
+
+with open("config.json","r") as read_file:
+   config=json.load(read_file)
+   
+chip2input = config['project']['peaks']['inputs']
+groupdata = config['project']['groups']
+blocks = config['project']['blocks']
+
+if None in list(blocks.values()):
+    samplesheet = [",".join(["SampleID","Condition", "Replicate", "bamReads", 
+         "ControlID", "bamControl", "Peaks", "PeakCaller"])]
+else:
+    samplesheet = [",".join(["SampleID","Condition","Treatment","Replicate", "bamReads", 
+         "ControlID", "bamControl", "Peaks", "PeakCaller"])]
+   
+
+for condition in args.group1, args.group2:
+    for chip in groupdata[condition]:
+        replicate = str([ i + 1 for i in range(len(groupdata[condition])) if groupdata[condition][i]== chip ][0])
+        bamReads = args.workpath + "/" + args.bamdir + "/" + chip + ".Q5DD.bam"
+        controlID = chip2input[chip]
+        if controlID != "":
+            bamControl = args.workpath + "/" + args.bamdir + "/" +  controlID + ".Q5DD.bam"
+        else:
+            bamControl = ""
+        peaks = args.workpath + "/" + args.peaktool + "/" + chip + "/" + chip + args.peakextension
+        if None in list(blocks.values()):
+            samplesheet.append(",".join([chip, condition, replicate, bamReads, 
+                   controlID, bamControl, peaks, args.peakcaller]))
+        else:
+            block = blocks[chip]
+            samplesheet.append(",".join([chip, condition, block, replicate, bamReads, 
+                   controlID, bamControl, peaks, args.peakcaller]))
+            
+
+f = open(args.csvfile, 'w')
+f.write ("\n".join(samplesheet))
+f.close()
diff --git a/bin/prep_diffbindQC.py b/bin/prep_diffbindQC.py
new file mode 100644
index 0000000..550b5f9
--- /dev/null
+++ b/bin/prep_diffbindQC.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+
+import json
+import argparse
+
+parser = argparse.ArgumentParser(description='Script to prepare the DiffBind input csv')
+parser.add_argument('--wp',dest='workpath',required=True,help='Full path of the working directory')
+parser.add_argument('--pt',dest='peaktool',required=True,help='Name of the the peak calling tool, also the directory where the peak file will be located')
+parser.add_argument('--pe',dest='peakextension',required=True,help='The file extension of the peakcall output')
+parser.add_argument('--pc',dest='peakcaller',required=True,help='Value for the PeakCaller column of the DiffBind csv')
+parser.add_argument('--bd',dest='bamdir',required=True,help='Name of the directory where the bam files are located')
+parser.add_argument('--csv',dest='csvfile',required=True,help='Name of the output csv file')
+
+args = parser.parse_args()
+
+with open("config.json","r") as read_file:
+   config=json.load(read_file)
+   
+chip2input = config['project']['peaks']['inputs']
+groupdata = config['project']['groups']
+
+tmpIDs = [x for xs in groupdata.values() for x in xs]
+Ncounts = [tmpIDs.count(tmp) for tmp in set(tmpIDs)]
+
+samplesheet = [",".join(["SampleID","Condition", "Replicate", "bamReads", 
+         "ControlID", "bamControl", "Peaks", "PeakCaller"])]
+
+count = 1
+for chip in chip2input.keys():
+   if set(Ncounts) == {1}: # if all samples only in one group
+      for key in groupdata.keys():
+          if chip in groupdata[key]:
+             condition = key
+      replicate = str([ i + 1 for i in range(len(groupdata[condition])) if groupdata[condition][i]== chip ][0])
+   else:
+      condition = ""
+      replicate = str(count)
+      count = count +1
+   bamReads = args.workpath + "/" + args.bamdir + "/" + chip + ".Q5DD.bam"
+   controlID = chip2input[chip]
+   if controlID != "":
+      bamControl = args.workpath + "/" + args.bamdir + "/" +  controlID + ".Q5DD.bam"
+   else:
+      bamControl = ""
+   peaks = args.workpath + "/" + args.peaktool + "/" + chip + "/" + chip + args.peakextension
+   samplesheet.append(",".join([chip, condition, replicate, bamReads, 
+                   controlID, bamControl, peaks, args.peakcaller]))            
+
+f = open(args.csvfile, 'w')
+f.write ("\n".join(samplesheet))
+f.close()
diff --git a/bin/promoterAnnotation_by_Gene.R b/bin/promoterAnnotation_by_Gene.R
new file mode 100755
index 0000000..846cfc0
--- /dev/null
+++ b/bin/promoterAnnotation_by_Gene.R
@@ -0,0 +1,179 @@
+####################
+#
+# Name: promoterAnnotationByGene.R
+# Created by: Tovah Markowitz, PhD
+# Bioinformatics (NCBR)/ Integrated Data Sciences Section (IDSS)
+# Research Technologies Branch/DIR/NIAID
+#
+# Created: August 8, 2022
+# Updated: October 26, 2022 to work with uropa 4.0.2
+# Updated: November 3, 2022 to fit with pipeline
+# 
+####################
+#
+# Purpose: To take UROPA allhits output files using "TSSprot" conditions and
+#          create a table of which genes have annotations overlapping their 
+#          promoters and how many times. Output format: dataframe
+#
+# Details: Promoters will be defined as 3kb upstream to 1 kb downstream of the 
+#          TSS. Allhits files were chosen to capture information from "peaks" 
+#          overlapping multiple promoters. Finalhits files can also be processed 
+#          with this pipeline. This script can handle multiple allhits files as 
+#          long as there are equal numbers of sampleNames to go with them. Also, 
+#          giving a matching DiffBind txt file will allow the allhits file to be 
+#          filtered to only include the significant differential peaks or to 
+#          split the data by the direction of log fold-change.
+#
+# Requires: GenomicRanges, tidyr
+#
+# Function: promoterAnnotationByGene(allhitsFiles, sampleNames, diffbindFiles=NA, direction=NA)
+# 
+# Variables:
+#     allhitsFiles:  [required] a vector of allhits files to process
+#     sampleNames:   [required] a vector of short names for each allhits file 
+#                               to use as column headers
+#     diffbindFiles: [optional] a vector of diffbind files to use to filter each 
+#                               allhits file
+#     direction:     [optional] when filtering using diffbindFiles, define how to
+#                               filter using log fold change. "Both" is default
+#                               when not defined by user.
+#                               Options: "both", "pos", "neg", "separate"
+#
+# Example usage:
+#     source("promoterAnnotation_by_Gene.R")
+#     out1 <- promoterAnnotationByGene(allhitsA.txt, "A")
+#     out2 <- promoterAnnotationByGene(allhitsA.txt, "A", diffbindA.txt, "both")
+#     out3 <- promoterAnnotationByGene(allhitsFiles= c(allhitsA.txt, allhitsB.txt), 
+#                                      sampleNames=c("A","B"), 
+#                                      diffbindFiles=c(diffbindA.txt,diffbindB.txt), 
+#                                      direction="pos")
+#     out4 <- promoterAnnotationByGene(allhitsFiles= c(allhitsA.txt, allhitsA.txt), 
+#                                      sampleNames=c("Deseq2","EdgeR"), 
+#                                      diffbindFiles=c(Deseq2.txt,EdgeR.txt), 
+#                                      direction="separate")
+# 
+####################
+
+
+allhits2promoter <- function(allhitsFile) {
+  # Reads an allhits file and keeps only the rows named "query_1" (the peaks overlapping
+  # promoters), reduced to the peak coordinates plus gene id and gene name.
+  # When no such row exists a message is printed and no table is returned.
+  allhits <- read.delim(allhitsFile)
+  promoterRows <- which(allhits$name == "query_1")
+  if (length(promoterRows) != 0) {
+    keepCols <- c("peak_chr", "peak_start", "peak_end", "gene_id", "gene_name")
+    promoterData <- allhits[promoterRows,]
+    return(promoterData[,keepCols])
+  }
+  print (paste0("Supplied file ", allhitsFile, " has no peaks overlapping promoters."))
+}
+
+filterPromoter <- function(Diffbind, promoterData, sampleName) {
+  # used by DiffbindFilterPromoter: keeps the promoterData rows whose peak matches a Diffbind
+  # interval (countOverlaps with type="equal", maxgap=1) and stores sampleName in sample_id
+  peakRanges <- GenomicRanges::makeGRangesFromDataFrame(promoterData, seqnames.field="peak_chr",
+                  start.field="peak_start", end.field="peak_end", starts.in.df.are.0based=F)
+  diffRanges <- GenomicRanges::makeGRangesFromDataFrame(Diffbind)
+  nMatches <- GenomicRanges::countOverlaps(peakRanges, diffRanges, type = "equal", maxgap=1)
+  matched <- promoterData[which(nMatches != 0),]
+  matched$sample_id <- sampleName
+  return(matched)
+}
+
+DiffbindFilterPromoter <- function(DiffbindFile, promoterData, sampleName, direction) {
+  # Keeps the promoter peaks matching a significant (FDR < 0.05) peak of DiffbindFile.
+  # direction picks the Diffbind rows by the sign of Fold:
+  #   "both" or NA - every significant peak, sample name unchanged
+  #   "pos"        - Fold > 0 only, sample name gets "_pos"
+  #   "neg"        - Fold < 0 only, sample name gets "_neg"
+  #   anything else (e.g. "separate") - the "_pos" rows followed by the "_neg" rows
+  significant <- read.delim(DiffbindFile)
+  significant <- significant[which(significant$FDR < 0.05),]
+
+  # filter against one sign of Fold and tag the result with the matching suffix
+  bySign <- function(sign) {
+    rows <- if (sign == "pos") which(significant$Fold > 0) else which(significant$Fold < 0)
+    filterPromoter(significant[rows,], promoterData, paste0(sampleName, "_", sign))
+  }
+  if (is.na(direction) | (direction == "both")) {
+    filtered <- filterPromoter(significant, promoterData, sampleName)
+  } else if (direction == "pos") {
+    filtered <- bySign("pos")
+  } else if (direction == "neg") {
+    filtered <- bySign("neg")
+  } else {
+    filtered <- rbind(bySign("pos"), bySign("neg"))
+  }
+  return(filtered)
+}
+
+createPromoterTable <- function(promoterData) {
+  # final output table: gene_id and gene_name plus one count column per sample_id
+  counts <- data.frame( table(promoterData[,c("gene_id", "sample_id")] ) )
+  geneNames <- unique(promoterData[,c("gene_id", "gene_name")] )
+  annotated <- merge(geneNames, counts)
+  return(tidyr::pivot_wider(annotated, names_from="sample_id", values_from="Freq"))
+}
+
+promoterAnnotationByGene <- function(allhitsFiles, sampleNames, diffbindFiles=NA, direction=NA) {
+  # the main function: returns the gene-by-sample promoter table, or only prints a message when the input lengths don't match
+  if ( length(allhitsFiles) != length(sampleNames) ) {
+    print("Number of allhits files and sample names don't match.")
+  } else {
+    if ( (length(allhitsFiles) != length(diffbindFiles)) & (sum(is.na(diffbindFiles)) != 1) ) {
+      print("Number of allhits files and diffbind files don't match.")
+    } else {
+      if ( length(allhitsFiles) == 1 ) {
+        promoterData <- allhits2promoter(allhitsFiles)
+        if (is.na(diffbindFiles)) {
+          promoterData$sample_id <- sampleNames
+        } else {
+          promoterData <- DiffbindFilterPromoter(diffbindFiles, promoterData, sampleNames, direction)
+        }
+      } else {
+        for ( a in 1:length(allhitsFiles) ) {
+          print(a)
+          tmpA <- allhits2promoter(allhitsFiles[a])
+          if (sum(is.na(diffbindFiles)) ==1) {
+            tmpA$sample_id <- sampleNames[a]
+          } else {
+            tmpA <- DiffbindFilterPromoter(diffbindFiles[a], tmpA, sampleNames[a], direction)
+          }
+          if (a == 1) {
+            promoterData <- tmpA
+          } else {
+            promoterData <- rbind(promoterData, tmpA)
+          }
+        }
+      }
+      # the table is built only in this branch: after a length mismatch promoterData does not exist
+      return(createPromoterTable(promoterData))
+    }
+  }
+}
+
+peakcallVersion <- function(inFolder,outFile) {
+  # Promoter table for every allhits file in inFolder (the folder with the uropa output files), written
+  # tab-delimited to outFile. Sample names are the text before "_macs": currently only works for macs outputs.
+  allhitsNames <- list.files(path=inFolder, pattern="allhits.txt")
+  allhitsPaths <- list.files(path=inFolder, pattern="allhits.txt", full.names = T)
+  sampleIDs <- matrix(unlist(strsplit(allhitsNames,"_macs")),ncol=2,byrow=T)[,1]
+  promoterInfo <- promoterAnnotationByGene(allhitsFiles=allhitsPaths, sampleNames=sampleIDs)
+  write.table(promoterInfo, outFile, quote=F,sep="\t",row.names=F)
+}
+
+diffbindVersion <- function(inFolder,outFile) {
+  # Promoter table for the DiffBind results under inFolder (the root working directory of the
+  # project), written tab-delimited to outFile. Currently designed for macs peaks analyzed by
+  # deseq2, with positive and negative fold changes analyzed together (direction="both").
+  # NOTE(review): uropa and Deseq2 files are paired by position in the two listings -- confirm they sort alike.
+  uropaDir <- paste0(inFolder, "/UROPA_annotations/DiffBind")
+  uropaPattern <- "DiffbindDeseq2_uropa_protTSS_allhits.txt"
+  uropaNames <- list.files(path=uropaDir, pattern=uropaPattern)
+  uropaPaths <- list.files(path=uropaDir, pattern=uropaPattern, full.names=T)
+  deseqPaths <- list.files(path=paste0(inFolder, "/DiffBind"), pattern="Deseq2.txt", full.names=T, recursive=T)
+  contrasts <- matrix(unlist(strsplit(uropaNames,"-macs")),ncol=2,byrow=T)[,1]
+  promoterInfo <- promoterAnnotationByGene(allhitsFiles=uropaPaths, sampleNames=contrasts, diffbindFiles=deseqPaths, direction="both")
+  write.table(promoterInfo, outFile, quote=F,sep="\t",row.names=F)
+}
diff --git a/bin/significantPathways.R b/bin/significantPathways.R
new file mode 100755
index 0000000..dd9af36
--- /dev/null
+++ b/bin/significantPathways.R
@@ -0,0 +1,127 @@
+####################
+#
+# Name: significantPathways.R
+# Created by: Tovah Markowitz, PhD
+# Bioinformatics (NCBR)/ Integrated Data Sciences Section (IDSS)
+# Research Technologies Branch/DIR/NIAID
+#
+# Created: August 9, 2022
+# Updated: October 28, 2022 to make reactomePA optional
+# Updated: November 4, 2022 to accept a txt file or a gtf for the background genes
+#                           also to accept a promoter annotation table and analyze every column
+# 
+####################
+#
+# Purpose: To take a list of genes and find the significant KEGG or Reactome
+#          pathways using overenrichment analysis. See details for specialized functionality.
+#
+# Requires: clusterProfiler, ReactomePA, enrichplot, org.Hs.eg.db, rtracklayer, ggplot2, and ggprism
+#
+# Details: Takes input gene lists as Ensembl gene IDs or gene symbols, converts to
+#          Entrez gene IDs, and runs ORA against KEGG or Reactome database. Requires
+#          a background gene list as cfChIP currently ignores chrs X, Y, and M.
+#          Outputs a dataframe of significant pathways, a pdf of the top most
+#          significant pathways, or a pdf of just the pathways of interest (if significant).
+#
+# Function: significantPathways(Genes, bkgGeneFILE, database="KEGG", PDFfile=NA, pathwayVector=NA)
+#
+# Variables:
+#   Genes:         [Required] a vector of the genes to be analyzed through ORA
+#   bkgGeneFILE:   [Required] a txt file containing a column of Ensembl IDs listing 
+#                  the appropriate background gene set or the gtf file used for the uropa
+#                  annotations 
+#                  For example: hg19.ensembl.prot_coding.with_annotations.txt
+#   database:      [Optional] whether to compare to the KEGG or Reactome database
+#                  default: KEGG
+#   PDFfile:       [Optional] name of the PDF file to create, if empty no PDF will be made
+#   pathwayVector: [Optional] a vector of pathways (descriptions or IDs) to plot in the pdf.
+#                  If PDFfile is empty, it is ignored. If this is empty and PDFfile is not,
+#                  pdf plot will be of the top 30 most significant pathways instead.
+#
+# Example usage:
+#   source("significantPathways.R")
+#   out <- significantPathways(Genes= c("GeneA","GeneB"), 
+#                              bkgGeneFILE= "hg19.ensembl.prot_coding.with_annotations.txt",
+#                              database="KEGG", PDFfile="a.pdf", 
+#                              pathwayVector=c("pathwayA", "pathwayB"))
+# 
+####################
+
+library(clusterProfiler)
+library(enrichplot)
+library(ggplot2)
+library(ggprism)
+
+makeBarplotTop <- function(inData, titleName, PDFfile) {
+  # Plots the pathways with p.adjust < 0.1 (30 at most) to PDFfile. With none significant no
+  # pdf device is opened, so dev.off() is called only after a plot was actually drawn.
+  inDataCount <- min(sum(inData@result$p.adjust < 0.1), 30)
+  if (inDataCount > 0) {
+    pdf(PDFfile)
+    print(barplot(inData, showCategory = inDataCount, label_format=70, title=titleName, x="GeneRatio") +
+            theme_prism(base_size =8) + theme(legend.title = element_text()) )
+    dev.off()
+  }
+}
+
+makeBarPlotSelect <- function(inData, titleName, PDFfile, categories) {
+  # Writes to PDFfile a bar plot restricted to the pathways given in categories.
+  pdf(PDFfile)
+  selectedPlot <- barplot(inData, showCategory = categories, label_format=70, title=titleName, x="GeneRatio") +
+    theme_prism(base_size = 8) + theme(legend.title = element_text())
+  print(selectedPlot); dev.off()
+}
+
+processGenes <- function(geneIDs) {
+  # Converts gene IDs to Entrez IDs: Ensembl IDs when the first ID starts with "ENSG", gene symbols (passed to bitr as given) otherwise.
+  if (grepl("^ENSG", geneIDs[1])) {
+    ensIDs <- gsub("\\.[0-9]+", "", geneIDs, perl=T)  # drop the ".<digits>" version suffix
+    entrezIDs <- bitr(ensIDs, from= "ENSEMBL", toType="ENTREZID", OrgDb="org.Hs.eg.db")
+  } else {
+    entrezIDs <- bitr(geneIDs, from= "SYMBOL", toType="ENTREZID", OrgDb="org.Hs.eg.db")
+  }
+  return(entrezIDs$ENTREZID)
+}
+
+significantPathways <- function(Genes, bkgGeneFILE, database="KEGG", PDFfile=NA, pathwayVector=NA) {
+  # ORA of Genes against KEGG (default) or Reactome with the genes of bkgGeneFILE as universe; returns the pathways with p.adjust < 0.1
+  sigGenes <- processGenes(Genes)
+  if (grepl("gtf",bkgGeneFILE)) {
+    bkgGenesData <- rtracklayer::import(bkgGeneFILE)
+    bkgGenes <- unique(bkgGenesData$gene_id)
+  } else {
+    bkgGenesData <- read.delim(bkgGeneFILE)
+    bkgGenes <- bkgGenesData[,grep("^ENSG", bkgGenesData[1,])]  # the column whose first value is an Ensembl ID
+  }
+  backgroundGenes <- processGenes(bkgGenes)
+  if (database == "KEGG") {
+    pathwaySig <- enrichKEGG(sigGenes, organism= "hsa", keyType="kegg", universe=backgroundGenes, use_internal_data=TRUE)
+  } else {
+    library(ReactomePA)
+    pathwaySig <- enrichPathway(sigGenes, readable=T, universe=backgroundGenes)
+  }
+  pathwayData <- pathwaySig@result[which(pathwaySig@result$p.adjust < 0.1),]
+  if (!is.na(PDFfile)) {
+    if (!all(is.na(pathwayVector))) {  # a single requested pathway is still a selection; only NA means "top pathways"
+      if (length(grep("HSA", pathwayVector, ignore.case=T)) != 0) {
+        pathwayVector <- pathwayData$Description[which(pathwayData$ID %in% pathwayVector)]  # IDs -> descriptions
+      }
+      makeBarPlotSelect(inData=pathwaySig, titleName=database, PDFfile=PDFfile, categories=pathwayVector)
+    } else {
+      makeBarplotTop(inData=pathwaySig, titleName=database, PDFfile=PDFfile)
+    }
+  }
+  return(pathwayData)
+}
+
+promoterAnnotationWrapper <- function(promoterFile, bkgGeneFILE, database="KEGG") {
+   # For every column from the third onward of the promoter table, runs significantPathways on the
+   # genes with a value above 0 and writes "<column>_<database>.txt" into promoterFile's folder.
+   promoterTable <- read.delim(promoterFile)
+   for (colIdx in 3:ncol(promoterTable)) {
+      hitGenes <- promoterTable$gene_id[which(promoterTable[,colIdx] > 0)]
+      outData <- significantPathways(hitGenes, bkgGeneFILE, database)
+      outFileName <- paste0(dirname(promoterFile),"/",names(promoterTable)[colIdx],"_",database,".txt")
+      write.table(outData, outFileName, quote=F, row.names=F, sep="\t")
+   }
+}
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 6d1bce7..240f016 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -14,6 +14,7 @@ configfile: "config.json"
 # Global workflow variables
 today                           = str(datetime.datetime.today()).split()[0].replace('-', '') # YYYYMMDD
 samples                         = config['samples']
+bin_path                        = config['project']['binpath']
 workpath                        = config['project']['workpath']
 assay                           = config['options']['assay']
 paired_end                      = False if config['project']['nends'] == 1 else True
diff --git a/workflow/rules/cfChIP.smk b/workflow/rules/cfChIP.smk
index 672a05b..4730054 100644
--- a/workflow/rules/cfChIP.smk
+++ b/workflow/rules/cfChIP.smk
@@ -5,117 +5,128 @@
 
 # ~~ workflow configuration
 workpath                        = config['project']['workpath']
+bin_path                        = config['project']['binpath']
 genome                          = config['options']['genome']
 blocks                          = config['project']['blocks']
 groupdata                       = config['project']['groups']
 
-
-# ~~ directories
+# Directory end points
+bam_dir                         = join(workpath, "bam")
+cfTool_dir                      = join(workpath, "cfChIPtool")
+cfTool_subdir2                  = join(cfTool_dir, "BED", "H3K4me3")
+qc_dir                          = join(workpath, "QC")
 
 
 rule cfChIPtool:
     input: 
-        out5=join(workpath,bam_dir,"{name}.Q5DD.bam.idxstat"),
+        out5                    = join(bam_dir, "{name}.Q5DD.bam.idxstat"),
     output:
-        out1=join(workpath,cfTool_subdir2,"{name}.Q5DD.tagAlign.gz"),
-        out2=join(workpath,cfTool_dir,"Output","H3K4me3","Signatures","{name}.Q5DD.csv"),
-        out3=join(workpath,cfTool_dir,"Samples","H3K4me3","{name}.Q5DD.rdata"),
+        out1                    = join(cfTool_subdir2, "{name}.Q5DD.tagAlign.gz"),
+        out2                    = join(cfTool_dir, "Output", "H3K4me3", "Signatures", "{name}.Q5DD.csv"),
+        out3                    = join(cfTool_dir, "Samples", "H3K4me3", "{name}.Q5DD.rdata"),
     params:
-        rname='cfChiP',
-        rver="R/4.1.0",
-        toolkit = config['references'][genome]['cfChIP_TOOLS_SRC'],
-        tmpfile = lambda w: join(workpath,cfTool_subdir2, w.name + ".Q5DD.tagAlign"),
-        tag=lambda w: temp(join(workpath,bam_dir, w.name+".Q5DD_tagAlign"))
+        rname                   = 'cfChiP',
+        rver                    = "R/4.1.0",
+        toolkit                 = config['references'][genome]['cfChIP_TOOLS_SRC'],
+        tmpfile                 = lambda w: join(cfTool_subdir2, w.name + ".Q5DD.tagAlign"),
+        tag                     = lambda w: temp(join(bam_dir, w.name + ".Q5DD_tagAlign"))
     container:
         config['images']['cfchip']
-    shell: """
-    cp {params.tag} {params.tmpfile}
-    gzip {params.tmpfile}
+    shell: 
+        """
+        cp {params.tag} {params.tmpfile}
+        gzip {params.tmpfile}
 
-    Rscript {params.toolkit}/ProcessBEDFiles.R \\
-        -a {params.toolkit}/SetupFiles/H3K4me3 \\
-        -r {cfTool_dir} \\
-        -p {cfTool_dir} \\
-        -m H3K4me3 \\
-        -S {output.out1}
-    """
+        Rscript {params.toolkit}/ProcessBEDFiles.R \\
+            -a {params.toolkit}/SetupFiles/H3K4me3 \\
+            -r {cfTool_dir} \\
+            -p {cfTool_dir} \\
+            -m H3K4me3 \\
+            -S {output.out1}
+        """
 
 
 rule cfChIPcompile:
     input:
         expand(join(cfTool_dir, "Output", "H3K4me3", "Signatures", "{name}.Q5DD.csv"), name=chips)
     output:
-        txt=join(workpath,"QC","H3K4me3_cfChIP_signature.txt"),
-        pdf=join(workpath,"QC","H3K4me3_cfChIP_signature.pdf")
+        txt                     = join(qc_dir, "H3K4me3_cfChIP_signature.txt"),
+        pdf                     = join(qc_dir, "H3K4me3_cfChIP_signature.pdf")
     params:
-        rname="cfChIP2",
-        script=join(workpath,"workflow","scripts","cfChIP_signatures.R"),
-        infolder=join(workpath,cfTool_dir,"Output","H3K4me3","Signatures"),
+        rname                   = "cfChIP2",
+        script                  = join(bin_path, "cfChIP_signatures.R"),
+        infolder                = join(cfTool_dir, "Output", "H3K4me3", "Signatures"),
     container:
         config['images']['cfchip']
-    shell: """
-    Rscript -e "source('{params.script}'); mergeSignatures( '{params.infolder}', '{output.txt}' )";
-    Rscript -e "source('{params.script}'); plotSignatures( '{output.txt}', '{output.pdf}' )";
-    """
+    shell: 
+        """
+        Rscript -e "source('{params.script}'); mergeSignatures( '{params.infolder}', '{output.txt}' )";
+        Rscript -e "source('{params.script}'); plotSignatures( '{output.txt}', '{output.pdf}' )";
+        """
+
 
 rule promoterTable1:
     input:
-        expand(join(workpath,uropa_dir,'{PeakTool}','{name}_{PeakTool}_uropa_protTSS_allhits.txt'),PeakTool=PeakTools,name=chips),
+        expand(join(workpath,uropa_dir,'{PeakTool}','{name}_{PeakTool}_uropa_protTSS_allhits.txt'), PeakTool=PeakTools, name=chips),
     output:
-        txt=join(workpath,uropa_dir,"promoterTable1",'{PeakTool}_promoter_overlap_summaryTable.txt')
+        txt                     = join(uropa_dir, "promoterTable1", "{PeakTool}_promoter_overlap_summaryTable.txt")
     params:
-        rname="promoter1",
-        script=join(workpath,"workflow","scripts","promoterAnnotation_by_Gene.R"),
-        infolder= join(workpath,uropa_dir, '{PeakTool}')
+        rname                   = "promoter1",
+        script                  = join(bin_path, "promoterAnnotation_by_Gene.R"),
+        infolder                = join(uropa_dir, '{PeakTool}')
     container:
         config['images']['cfchip']
-    shell: """
-    Rscript -e "source('{params.script}'); peakcallVersion('{params.infolder}','{output.txt}')";
-    """
+    shell: 
+        """
+        Rscript -e "source('{params.script}'); peakcallVersion('{params.infolder}','{output.txt}')";
+        """
+
 
 rule promoterTable2:
     input:
-        expand(join(workpath,uropa_dir,diffbind_dir,'{name}_DiffbindDeseq2_uropa_protTSS_allhits.txt'), name=contrasts),
+        expand(join(diffbind_dir, '{name}_DiffbindDeseq2_uropa_protTSS_allhits.txt'), name=contrasts),
     output:
-        txt=join(workpath,uropa_dir,"promoterTable2",'DiffbindDeseq2_{PeakTool}_promoter_overlap_summaryTable.txt'),
+        txt                     = join(workpath,uropa_dir,"promoterTable2",'DiffbindDeseq2_{PeakTool}_promoter_overlap_summaryTable.txt'),
     params:
-        rname="promoter2",
-        script1=join(workpath,"workflow","scripts","promoterAnnotation_by_Gene.R"),
-        script2=join(workpath,"workflow","scripts","significantPathways.R"),
-        infolder= workpath,
-        gtf = config['references'][genome]['GTFFILE'],
+        rname                   = "promoter2",
+        script1                 = join(bin_path, "promoterAnnotation_by_Gene.R"),
+        script2                 = join(bin_path, "significantPathways.R"),
+        infolder                = workpath,
+        gtf                     = config['references'][genome]['GTFFILE'],
     container:
         config['images']['cfchip']
-    shell: """
-    Rscript -e "source('{params.script1}'); diffbindVersion('{params.infolder}','{output.txt}')";
-    Rscript -e "source('{params.script2}'); promoterAnnotationWrapper('{output.txt}','{params.gtf}','KEGG')";
-    Rscript -e "source('{params.script2}'); promoterAnnotationWrapper('{output.txt}','{params.gtf}','Reactome')";
-    """
+    shell: 
+        """
+        Rscript -e "source('{params.script1}'); diffbindVersion('{params.infolder}','{output.txt}')";
+        Rscript -e "source('{params.script2}'); promoterAnnotationWrapper('{output.txt}','{params.gtf}','KEGG')";
+        Rscript -e "source('{params.script2}'); promoterAnnotationWrapper('{output.txt}','{params.gtf}','Reactome')";
+        """
 
 rule diffbindQC:
     input:
-       lambda w: [ join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) for chip in chips ]
+        lambda w: [ join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) for chip in chips ]
     output:
-       html = join(workpath, "QC", "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBindQC.html"),
-       bed = join(workpath, "QC", "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBindQC_TMMcounts.bed"),
+        html                    = join(qc_dir, "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBindQC.html"),
+        bed                     = join(workpath, "QC", "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBindQC_TMMcounts.bed"),
     params:
-       rname="diffbindQC",
-       rscript=join(workpath,"workflow","scripts","DiffBind_v2_cfChIP_QC.Rmd"),
-       outdir = join(workpath, "QC", "AllSamples-{PeakTool}"),
-       contrast = "AllSamples",
-       csvfile = join(workpath, "QC", "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBind_prep.csv"),
-       pythonscript = join(workpath,"workflow","scripts","prep_diffbindQC.py"),
-       PeakExtension= lambda w: PeakExtensions[w.PeakTool],
-       PeakTool="{PeakTool}",
-       peakcaller= lambda w: FileTypesDiffBind[w.PeakTool],
+       rname                    = "diffbindQC",
+       contrast                 = "AllSamples",
+       PeakTool                 = "{PeakTool}",
+       rscript                  = join(bin_path, "DiffBind_v2_cfChIP_QC.Rmd"),
+       outdir                   = join(qc_dir, "AllSamples-{PeakTool}"),
+       csvfile                  = join(qc_dir, "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBind_prep.csv"),
+       pythonscript             = join(bin_path, "prep_diffbindQC.py"),
+       PeakExtension            = lambda w: PeakExtensions[w.PeakTool],
+       peakcaller               = lambda w: FileTypesDiffBind[w.PeakTool],
     container:
        config['images']['cfchip']
-    shell: """
-       python {params.pythonscript} --wp {workpath} \
-         --pt {params.PeakTool} --pe {params.PeakExtension} --bd {bam_dir} \
-         --pc {params.peakcaller} --csv {params.csvfile}
-       cp {params.rscript} {params.outdir}
-       cd {params.outdir}
-       Rscript -e 'rmarkdown::render("DiffBind_v2_cfChIP_QC.Rmd", output_file= "{output.html}", 
-           params=list(csvfile= "{params.csvfile}", contrasts= "{params.contrast}", peakcaller= "{params.PeakTool}"))'
-    """
\ No newline at end of file
+    shell: 
+        """
+        python {params.pythonscript} --wp {workpath} \
+            --pt {params.PeakTool} --pe {params.PeakExtension} --bd {bam_dir} \
+            --pc {params.peakcaller} --csv {params.csvfile}
+        cp {params.rscript} {params.outdir}
+        cd {params.outdir}
+        Rscript -e 'rmarkdown::render("DiffBind_v2_cfChIP_QC.Rmd", output_file= "{output.html}", 
+            params=list(csvfile= "{params.csvfile}", contrasts= "{params.contrast}", peakcaller= "{params.PeakTool}"))'
+        """
\ No newline at end of file
diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk
index 894c0f4..b39042f 100644
--- a/workflow/rules/qc.smk
+++ b/workflow/rules/qc.smk
@@ -8,6 +8,7 @@ from scripts.common import get_bam_ext, get_fqscreen_outputs
 
 # ~~ workflow configuration
 workpath                        = config['project']['workpath']
+bin_path                        = config['project']['binpath']
 genome                          = config['options']['genome']
 paired_end                      = False if config['project']['nends'] == 1 else True
 samples                         = config['samples']
@@ -17,6 +18,7 @@ ends                            = [1] if not paired_end else [1, 2]
 qc_dir                          = join(workpath, "QC")
 kraken_dir                      = join(workpath, 'kraken')
 deeptools_dir                   = join(workpath, 'deeptools')
+peakqc_dir                      = join(workpath, "PeakQC")
 extra_fingerprint_dir           = join(deeptools_dir, 'sorted_fingerprint')
 
 
@@ -387,26 +389,28 @@ rule FRiP:
     output:
         join(workpath,"PeakQC","{PeakTool}.{name}.Q5DD.FRiP_table.txt"),
     params:
-        rname="frip",
-        outroot = lambda w: join(workpath,"PeakQC",w.PeakTool),
-        script=join(workpath,"workflow","scripts","frip.py"),
-        genome = config['references'][genome]['REFLEN'],
-        tmpdir = tmpdir,
-    container: config['images']['python']
-    shell: """
-    # Setups temporary directory for
-    # intermediate files with built-in 
-    # mechanism for deletion on exit
-    if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
-    tmp=$(mktemp -d -p "{params.tmpdir}")
-    trap 'rm -rf "${{tmp}}"' EXIT
+        rname                   = "frip",
+        outroot                 = lambda w: join(peakqc_dir, w.PeakTool),
+        script                  = join(bin_path, "frip.py"),
+        genome                  = config['references'][genome]['REFLEN'],
+        tmpdir                  = tmpdir,
+    container: 
+        config['images']['python']
+    shell: 
+        """
+        # Setups temporary directory for
+        # intermediate files with built-in 
+        # mechanism for deletion on exit
+        if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
+        tmp=$(mktemp -d -p "{params.tmpdir}")
+        trap 'rm -rf "${{tmp}}"' EXIT
 
-    python {params.script} \\
-        -p {input.bed} \\
-        -b {input.bam} \\
-        -g {params.genome} \\
-        -o {params.outroot}
-    """
+        python {params.script} \\
+            -p {input.bed} \\
+            -b {input.bam} \\
+            -g {params.genome} \\
+            -o {params.outroot}
+        """
 
 rule jaccard:
     input:
@@ -414,15 +418,17 @@ rule jaccard:
     output:
         join(qc_dir, '{PeakTool}_jaccard.txt'),
     params:
-        rname="jaccard",
-        outroot = lambda w: join(qc_dir, w.PeakTool),
-        script=join(workpath,"workflow","scripts","jaccard_score.py"),
-        genome = config['references'][genome]['REFLEN']
+        rname                   = "frip",
+        rname                   = "jaccard",
+        outroot                 = lambda w: join(qc_dir, w.PeakTool),
+        script                  = join(bin_path, "jaccard_score.py"),
+        genome                  = config['references'][genome]['REFLEN']
     envmodules:
         config['tools']['BEDTOOLSVER']
-    shell: """
-    python {params.script} \\
-        -i "{input}" \\
-        -o "{params.outroot}" \\
-        -g {params.genome}
-    """
+    shell: 
+        """
+        python {params.script} \\
+            -i "{input}" \\
+            -o "{params.outroot}" \\
+            -g {params.genome}
+        """
diff --git a/workflow/scripts/common.py b/workflow/scripts/common.py
index 26f44b6..6feba1c 100644
--- a/workflow/scripts/common.py
+++ b/workflow/scripts/common.py
@@ -258,14 +258,4 @@ def get_fqscreen_outputs(paired_end, samples, qc_dir):
         outs.extend(expand(join(qc_dir, "FQscreen", "{name}.R1.trim_screen.png"), name=samples)),
         outs.extend(expand(join(qc_dir, "FQscreen2", "{name}.R1.trim_screen.txt"), name=samples)),
         outs.extend(expand(join(qc_dir, "FQscreen2", "{name}.R1.trim_screen.png"), name=samples)),
-    return outs
-
-
-def test_combine(one, two):
-    try:
-        three = one + two
-    except:
-        print(one)
-        print(two)
-        exit()
-    return three
\ No newline at end of file
+    return outs
\ No newline at end of file

From 07eb85fde045073c3b6563a08d60b306563d71cd Mon Sep 17 00:00:00 2001
From: Ryan Routsong <routsongrm@nih.gov>
Date: Wed, 10 Jul 2024 11:39:00 -0400
Subject: [PATCH 05/28] fix: remove old imports, duplicate parameters

---
 workflow/rules/dba.smk |  2 +-
 workflow/rules/qc.smk  | 29 +++++++++++++++--------------
 2 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/workflow/rules/dba.smk b/workflow/rules/dba.smk
index d767f57..e20c48a 100644
--- a/workflow/rules/dba.smk
+++ b/workflow/rules/dba.smk
@@ -3,7 +3,7 @@
 import os
 import json
 from os.path import join
-from scripts.common import allocated, mk_dir_if_not_exist, test_combine
+from scripts.common import allocated, mk_dir_if_not_exist
 from scripts.peakcall import outputIDR, zip_peak_files, calc_effective_genome_fraction
 from scripts.blocking import test_for_block
 
diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk
index b39042f..26861ef 100644
--- a/workflow/rules/qc.smk
+++ b/workflow/rules/qc.smk
@@ -366,21 +366,23 @@ rule deeptools_QC:
     input:
         [ join(workpath, bw_dir, name + ".Q5DD.RPGC.bw") for name in samples ] # this should be all bigwigs
     output:
-        heatmap=join(deeptools_dir, "spearman_heatmap.Q5DD.pdf"),
-        pca=join(deeptools_dir, "pca.Q5DD.pdf"),
-	    npz=temp(join(deeptools_dir, "Q5DD.npz")),
-	    png=join(deeptools_dir, "spearman_heatmap.Q5DD_mqc.png")
+        javaram                 = '16g',
+        heatmap                 = join(deeptools_dir, "spearman_heatmap.Q5DD.pdf"),
+        pca                     = join(deeptools_dir, "pca.Q5DD.pdf"),
+	    npz                     = temp(join(deeptools_dir, "Q5DD.npz")),
+	    png                     = join(deeptools_dir, "spearman_heatmap.Q5DD_mqc.png")
     params:
-        rname="deeptools_QC",
-        deeptoolsver=config['tools']['DEEPTOOLSVER'],
+        rname                   = "deeptools_QC",
+        deeptoolsver            = config['tools']['DEEPTOOLSVER'],
         labels=samples # this should be the sample names to match the bigwigs in the same order
-    shell: """    
-    module load {params.deeptoolsver}
-    multiBigwigSummary bins -b {input} -l {params.labels} -out {output.npz}
-    plotCorrelation -in {output.npz} -o {output.heatmap} -c 'spearman' -p 'heatmap' --skipZeros --removeOutliers
-    plotCorrelation -in {output.npz} -o {output.png} -c 'spearman' -p 'heatmap' --skipZeros --removeOutliers
-    plotPCA -in {output.npz} -o {output.pca}
-    """
+    shell: 
+        """    
+        module load {params.deeptoolsver}
+        multiBigwigSummary bins -b {input} -l {params.labels} -out {output.npz}
+        plotCorrelation -in {output.npz} -o {output.heatmap} -c 'spearman' -p 'heatmap' --skipZeros --removeOutliers
+        plotCorrelation -in {output.npz} -o {output.png} -c 'spearman' -p 'heatmap' --skipZeros --removeOutliers
+        plotPCA -in {output.npz} -o {output.pca}
+        """
 
 rule FRiP:
     input:
@@ -418,7 +420,6 @@ rule jaccard:
     output:
         join(qc_dir, '{PeakTool}_jaccard.txt'),
     params:
-        rname                   = "frip",
         rname                   = "jaccard",
         outroot                 = lambda w: join(qc_dir, w.PeakTool),
         script                  = join(bin_path, "jaccard_score.py"),

From 83b0e5c68127bd7b42d530d0cf3bd1c2448f2641 Mon Sep 17 00:00:00 2001
From: Ryan Routsong <routsongrm@nih.gov>
Date: Wed, 10 Jul 2024 12:55:41 -0400
Subject: [PATCH 06/28] fix: correct wildcard typo `{type}` to `{_type}`

---
 workflow/Snakefile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/workflow/Snakefile b/workflow/Snakefile
index 240f016..dc011b8 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -94,7 +94,7 @@ if assay == "cfchip":
             group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC
         ))
         rule_all_ins.extend(expand(
-            join(uropa_dir, "{name}_{PeakTool}_uropa_{type}_allhits.txt"), PeakTool=['DiffbindEdgeR','DiffbindDeseq2'], 
+            join(uropa_dir, "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), PeakTool=['DiffbindEdgeR','DiffbindDeseq2'], 
             name=contrasts, _type=["protTSS"]
         ))
 
@@ -121,7 +121,7 @@ if assay == "cfchip":
             ))
         if reps:
             rule_all_ins.extend(expand(
-                join(uropa_dir, "{PeakTool}", "{name}_{PeakTool}_uropa_{type}_allhits.txt"),
+                join(uropa_dir, "{PeakTool}", "{name}_{PeakTool}_uropa_{_type}_allhits.txt"),
                 PeakTool=PeakTools, name=chips, _type=peak_types
             ))
             rule_all_ins.extend(expand(
@@ -129,7 +129,7 @@ if assay == "cfchip":
                 group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC
             ))
             rule_all_ins.extend(expand(
-                join(uropa_dir, "{name}_{PeakTool}_uropa_{type}_allhits.txt"), 
+                join(uropa_dir, "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), 
                 PeakTool=["DiffbindEdgeR", "DiffbindDeseq2"], 
                 name=contrasts, 
                 _type=peak_types

From 543121be62664525af9a0d58ea8f1ec20add6717 Mon Sep 17 00:00:00 2001
From: Ryan Routsong <routsongrm@nih.gov>
Date: Wed, 10 Jul 2024 17:35:01 -0400
Subject: [PATCH 07/28] fix: correct pathing on diffbind outputs

---
 workflow/Snakefile     |  8 +++++---
 workflow/rules/dba.smk | 16 +++++++++-------
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/workflow/Snakefile b/workflow/Snakefile
index dc011b8..e025c45 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -49,6 +49,7 @@ macsB_dir                       = join(workpath, "macsBroad")
 sicer_dir                       = join(workpath, "sicer")
 peakqc_dir                      = join(workpath, "PeakQC")
 uropa_dir                       = join(workpath, "UROPA_annotations")
+uropa_diffbind_dir              = join(uropa_dir, "DiffBind")
 diffbind_dir                    = join(workpath, "DiffBind")
 cfTool_dir                      = join(workpath, "cfChIPtool")
 genrich_dir                     = join(workpath, "Genrich")
@@ -94,10 +95,11 @@ if assay == "cfchip":
             group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC
         ))
         rule_all_ins.extend(expand(
-            join(uropa_dir, "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), PeakTool=['DiffbindEdgeR','DiffbindDeseq2'], 
-            name=contrasts, _type=["protTSS"]
+            join(uropa_diffbind_dir, "{PeakTool}", "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), 
+            PeakTool=['DiffbindEdgeR','DiffbindDeseq2'], 
+            name=contrasts,
+             _type=["protTSS"]
         ))
-
     elif assay in ["atac", "chip"]:
         peak_types.extend(["prot", "protSEC", "genes"])
         rule_all_ins.extend(expand(join(MEME_dir, "{PeakTool}", "{name}_meme", "meme-chip.html"), PeakTool=PeakTools, name=chips))
diff --git a/workflow/rules/dba.smk b/workflow/rules/dba.smk
index e20c48a..eb8d741 100644
--- a/workflow/rules/dba.smk
+++ b/workflow/rules/dba.smk
@@ -23,6 +23,8 @@ bin_path                        = join(workpath, "workflow", "bin")
 diffbind_dir_block              = join(workpath, "DiffBindBlock")
 diffbind_dir2                   = join(workpath, "DiffBind_block")
 diffbind_dir                    = join(workpath, "DiffBind")
+uropa_dir                       = join(workpath, "UROPA_annotations")
+uropa_diffbind_dir              = join(uropa_dir, "DiffBind")
 bam_dir                         = join(workpath, "bam")
 qc_dir                          = join(workpath, "PeakQC")
 idr_dir                         = join(workpath, "IDR")
@@ -93,11 +95,11 @@ rule diffbind:
         lambda w: [ join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) for chip in chips ]
     output:
         html                            = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"),
-        Deseq2                          = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2.bed"),
-        EdgeR                           = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR.bed"),
-        EdgeR_txt                       = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR.txt"),
-        Deseq2_txt                      = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2.txt"),
-        EdgeR_ftxt                      = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR_fullList.txt"),
+        Deseq2                          = join(diffbind_dir, "DiffbindDeseq2", "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2.bed"),
+        EdgeR                           = join(diffbind_dir, "DiffbindEdgeR", "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR.bed"),
+        EdgeR_txt                       = join(diffbind_dir, "DiffbindEdgeR", "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR.txt"),
+        Deseq2_txt                      = join(diffbind_dir, "DiffbindDeseq2", "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2.txt"),
+        EdgeR_ftxt                      = join(diffbind_dir, "DiffbindEdgeR", "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR_fullList.txt"),
         Deseq2_ftxt                     = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2_fullList.txt"),
         html_block                      = provided(join(diffbind_dir_block, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_blocking.html"), blocking)
     params:
@@ -141,11 +143,11 @@ rule diffbind:
         cp {params.rscript} {params.outdir}
         cd {params.outdir}
         Rscript -e 'rmarkdown::render("DiffBind_v2_ChIPseq.Rmd", output_file= "{output.html}", 
-        params=list(csvfile= "{params.csvfile}", contrasts= "{params.contrast}", peakcaller= "{params.this_peaktool}"))'
+        params=list(csvfile= "{params.csvfile}", contrasts= "{params.this_contrast}", peakcaller= "{params.this_peaktool}"))'
         if [ ! -f {output.Deseq2} ]; then touch {output.Deseq2}; fi
         if [ ! -f {output.EdgeR} ]; then touch {output.EdgeR}; fi
 
-        if [ '{params.blocking}' == True ]; then
+        if [ '"""+str(blocking)+"""' == True ]; then
             echo "DiffBind with Blocking"
             Rscript -e 'rmarkdown::render("{params.blocking_rscript}", output_file= "{output.html_block}", 
             params=list(csvfile= "{params.csvfile}", contrasts= "{params.this_contrast}", peakcaller= "{params.this_peaktool}", dir= "{params.outdir_block}"))'

From e5027d8b4de61df2fb05f5b4cceefeecb521c35b Mon Sep 17 00:00:00 2001
From: Ryan Routsong <routsongrm@nih.gov>
Date: Thu, 11 Jul 2024 10:53:28 -0400
Subject: [PATCH 08/28] fix: missing imports in peakcall rules

---
 workflow/rules/peakcall.smk | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/workflow/rules/peakcall.smk b/workflow/rules/peakcall.smk
index 466d639..1fce9b2 100644
--- a/workflow/rules/peakcall.smk
+++ b/workflow/rules/peakcall.smk
@@ -4,7 +4,8 @@
 # Common quality-control rules: preseq, NRF, rawfastqc,
 #   fastqc, fastq_screen, multiQC
 from os.path import join
-from scripts.peakcall import get_control_input, getMacTXT, getMacChip
+from scripts.peakcall import get_control_input, getMacTXT, getMacChip,
+    getSicerChips, getSicerFragLen, get_control_input
 
 
 # ~~ workflow configuration

From 0e7ba2ec1a12e166fb4a738d42ba1598c2305bfe Mon Sep 17 00:00:00 2001
From: Ryan Routsong <routsongrm@nih.gov>
Date: Thu, 11 Jul 2024 10:58:26 -0400
Subject: [PATCH 09/28] fix: add line continuation to multi-line import

---
 workflow/rules/peakcall.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/rules/peakcall.smk b/workflow/rules/peakcall.smk
index 1fce9b2..cdc1762 100644
--- a/workflow/rules/peakcall.smk
+++ b/workflow/rules/peakcall.smk
@@ -4,7 +4,7 @@
 # Common quality-control rules: preseq, NRF, rawfastqc,
 #   fastqc, fastq_screen, multiQC
 from os.path import join
-from scripts.peakcall import get_control_input, getMacTXT, getMacChip,
+from scripts.peakcall import get_control_input, getMacTXT, getMacChip, \
     getSicerChips, getSicerFragLen, get_control_input
 
 

From 8f6cedd3c7800a064715dc1adfe70683667eddc5 Mon Sep 17 00:00:00 2001
From: Ryan Routsong <routsongrm@nih.gov>
Date: Thu, 11 Jul 2024 11:11:49 -0400
Subject: [PATCH 10/28] fix: support single-end input in bwa rule

---
 workflow/rules/trim_align_dedup.smk | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/workflow/rules/trim_align_dedup.smk b/workflow/rules/trim_align_dedup.smk
index 0f1b458..8eae5cd 100644
--- a/workflow/rules/trim_align_dedup.smk
+++ b/workflow/rules/trim_align_dedup.smk
@@ -42,7 +42,7 @@ rule trim:
     """
     input:
         file1                               = join(workpath, "{name}.R1.fastq.gz"),
-        file2                               = provided(join(workpath,"{name}.R2.fastq.gz"), paired_end)
+        file2                               = provided(join(workpath, "{name}.R2.fastq.gz"), paired_end)
     output:
         outfq1                              = temp(join(trim_dir, "{name}.R1.trim.fastq.gz")),
         outfq2                              = provided(temp(join(trim_dir, "{name}.R2.trim.fastq.gz")), paired_end)
@@ -171,7 +171,7 @@ rule BWA:
     """
     input:
         infq1                               = join(trim_dir, "{name}.R1.trim.fastq.gz"),
-        infq2                               = join(trim_dir, "{name}.R2.trim.fastq.gz"),
+        infq2                               = join(trim_dir, "{name}.R2.trim.fastq.gz") if paired_end else [],
     params:
         d                                   = join(bam_dir),
         rname                               = 'bwa',

From c4e32cdd5e2e1cd57e59f9e659f468e34b287d99 Mon Sep 17 00:00:00 2001
From: Ryan Routsong <routsongrm@nih.gov>
Date: Thu, 11 Jul 2024 13:59:52 -0400
Subject: [PATCH 11/28] chore: spacing, refactor manorm rule

---
 workflow/rules/cfChIP.smk           |   2 +
 workflow/rules/dba.smk              |  79 ++---
 workflow/rules/peakcall.smk         | 470 ++++++++++++++--------------
 workflow/rules/qc.smk               |  88 +++---
 workflow/rules/trim_align_dedup.smk |  13 +-
 workflow/scripts/peakcall.py        |   9 +-
 6 files changed, 350 insertions(+), 311 deletions(-)

diff --git a/workflow/rules/cfChIP.smk b/workflow/rules/cfChIP.smk
index 4730054..665ef2f 100644
--- a/workflow/rules/cfChIP.smk
+++ b/workflow/rules/cfChIP.smk
@@ -10,6 +10,7 @@ genome                          = config['options']['genome']
 blocks                          = config['project']['blocks']
 groupdata                       = config['project']['groups']
 
+
 # Directory end points
 bam_dir                         = join(workpath, "bam")
 cfTool_dir                      = join(workpath, "cfChIPtool")
@@ -102,6 +103,7 @@ rule promoterTable2:
         Rscript -e "source('{params.script2}'); promoterAnnotationWrapper('{output.txt}','{params.gtf}','Reactome')";
         """
 
+
 rule diffbindQC:
     input:
         lambda w: [ join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) for chip in chips ]
diff --git a/workflow/rules/dba.smk b/workflow/rules/dba.smk
index eb8d741..80265a4 100644
--- a/workflow/rules/dba.smk
+++ b/workflow/rules/dba.smk
@@ -4,7 +4,7 @@ import os
 import json
 from os.path import join
 from scripts.common import allocated, mk_dir_if_not_exist
-from scripts.peakcall import outputIDR, zip_peak_files, calc_effective_genome_fraction
+from scripts.peakcall import outputIDR, zip_peak_files, calc_effective_genome_fraction, get_manorm_sizes
 from scripts.blocking import test_for_block
 
 
@@ -214,7 +214,7 @@ rule UROPA:
                     this_q['distance'] = _d
                     json_construct['queries'].append(this_q)
             elif '{type}' == 'protSEC':
-                #   distance, feature.anchor
+                # distance, feature.anchor
                 query_values = (
                     ([3000, 1000], "start"), 
                     (3000,         "end"), 
@@ -239,39 +239,46 @@ rule UROPA:
 
 
 rule manorm:
-    input: 
-        bam1 = lambda w: join(workpath,bam_dir, groupdata[w.group1][0] + ".Q5DD.bam"),
-        bam2 = lambda w: join(workpath,bam_dir, groupdata[w.group2][0] + ".Q5DD.bam"),
-        ppqt = join(workpath,bam_dir, "Q5DD.ppqt.txt"),
-        peak1 = lambda w: join(workpath, w.tool, groupdata[w.group1][0], groupdata[w.group1][0] + PeakExtensions[w.tool]),
-        peak2 = lambda w: join(workpath, w.tool, groupdata[w.group2][0], groupdata[w.group2][0] + PeakExtensions[w.tool]),
+    input:
+        bam1                            = lambda w: join(bam_dir, groupdata[w.group1][0] + ".Q5DD.bam"),
+        bam2                            = lambda w: join(bam_dir, groupdata[w.group2][0] + ".Q5DD.bam"),
+        ppqt                            = join(bam_dir, "Q5DD.ppqt.txt"),
+        peak1                           = lambda w: join(workpath, w.tool, groupdata[w.group1][0], groupdata[w.group1][0] + PeakExtensions[w.tool]),
+        peak2                           = lambda w: join(workpath, w.tool, groupdata[w.group2][0], groupdata[w.group2][0] + PeakExtensions[w.tool]),
     output:
-        xls = join(workpath,manorm_dir,"{group1}_vs_{group2}-{tool}","{group1}_vs_{group2}-{tool}_all_MAvalues.xls"),
-        bed = temp(join(workpath,manorm_dir,"{group1}_vs_{group2}-{tool}","{group1}_vs_{group2}-{tool}_all_MA.bed")),
-        wigA = join(workpath,manorm_dir,"{group1}_vs_{group2}-{tool}","output_tracks","{group1}_vs_{group2}_A_values.wig.gz"),
-        wigM = join(workpath,manorm_dir,"{group1}_vs_{group2}-{tool}","output_tracks","{group1}_vs_{group2}_M_values.wig.gz"),
-        wigP = join(workpath,manorm_dir,"{group1}_vs_{group2}-{tool}","output_tracks","{group1}_vs_{group2}_P_values.wig.gz"),
+        xls                             = join(manorm_dir, "{group1}_vs_{group2}-{tool}","{group1}_vs_{group2}-{tool}_all_MAvalues.xls"),
+        bed                             = temp(join(manorm_dir, "{group1}_vs_{group2}-{tool}","{group1}_vs_{group2}-{tool}_all_MA.bed")),
+        wigA                            = join(manorm_dir, "{group1}_vs_{group2}-{tool}","output_tracks","{group1}_vs_{group2}_A_values.wig.gz"),
+        wigM                            = join(manorm_dir, "{group1}_vs_{group2}-{tool}","output_tracks","{group1}_vs_{group2}_M_values.wig.gz"),
+        wigP                            = join(manorm_dir, "{group1}_vs_{group2}-{tool}","output_tracks","{group1}_vs_{group2}_P_values.wig.gz"),
     params:
-        rname='manorm',
-        fldr = join(workpath,manorm_dir,"{group1}_vs_{group2}-{tool}"),
-        bedtoolsver=config['tools']['BEDTOOLSVER'],
-        sample1= lambda w: groupdata[w.group1][0],
-        sample2= lambda w: groupdata[w.group2][0],
-        manormver="manorm/1.1.4"
-    run:
-        commoncmd1 = "if [ ! -e /lscratch/$SLURM_JOBID ]; then mkdir /lscratch/$SLURM_JOBID; fi "
-        commoncmd2 = "cd /lscratch/$SLURM_JOBID; "
-        commoncmd3 = "module load {params.manormver}; module load {params.bedtoolsver}; "
-        cmd1 = "bamToBed -i {input.bam1} > bam1.bed; "
-        cmd2 = "bamToBed -i {input.bam2} > bam2.bed; "
-        cmd3 = "cut -f 1,2,3 {input.peak1} > peak1.bed; "
-        cmd4 = "cut -f 1,2,3 {input.peak2} > peak2.bed; "
-        file=list(map(lambda z:z.strip().split(),open(input.ppqt,'r').readlines()))
-        extsize1 = [ ppqt[1] for ppqt in file if ppqt[0] == params.sample1 ][0]
-        extsize2 = [ ppqt[1] for ppqt in file if ppqt[0] == params.sample2 ][0]
-        cmd5 = "manorm --p1 peak1.bed --p2 peak2.bed --r1 bam1.bed --r2 bam2.bed --s1 " + extsize1  + " --s2 " + extsize2 + " -o {params.fldr} --name1 '" + wildcards.group1 + "' --name2 '" + wildcards.group2 + "'; "
-        cmd6 = "gzip {params.fldr}/output_tracks/*wig; "
-        cmd7 = "mv {params.fldr}/" + wildcards.group1 + "_vs_" + wildcards.group2 + "_all_MAvalues.xls {output.xls}; "
-        cmd8 = "tail -n +2 {output.xls} | nl -w2 | awk -v OFS='\t' '{{print $2,$3,$4,$9$1,$6}}' > {output.bed}"
-        shell(commoncmd1)
-        shell( commoncmd2 + commoncmd3 + cmd1 + cmd2 + cmd3 + cmd4 + cmd5 + cmd6 + cmd7 + cmd8 )
\ No newline at end of file
+        rname                           = 'manorm',
+        fldr                            = join(manorm_dir, "{group1}_vs_{group2}-{tool}"),
+        bedtoolsver                     = config['tools']['BEDTOOLSVER'],
+        manormver                       = "manorm/1.1.4",
+        extsizes                        = lambda w, input: get_manorm_sizes(w.group1, w.group2, groupdata, input.ppqt)
+    shell:
+        """
+        if [ ! -e /lscratch/$SLURM_JOBID ]; then 
+            mkdir /lscratch/$SLURM_JOBID
+        fi
+        cd /lscratch/$SLURM_JOBID
+        module load {params.manormver}
+        module load {params.bedtoolsver}
+        bamToBed -i {input.bam1} > bam1.bed
+        bamToBed -i {input.bam2} > bam2.bed
+        cut -f 1,2,3 {input.peak1} > peak1.bed
+        cut -f 1,2,3 {input.peak2} > peak2.bed
+        manorm \
+            --p1 peak1.bed \
+            --p2 peak2.bed \
+            --r1 bam1.bed \
+            --r2 bam2.bed \
+            {params.extsizes} \
+            -o {params.fldr} \
+            --name1 {wildcards.group1} \
+            --name2 {wildcards.group2}
+        gzip {params.fldr}/output_tracks/*wig
+        mv {params.fldr}/{wildcards.group1}_vs_{wildcards.group2}_all_MAvalues.xls {output.xls}
+        tail -n +2 {output.xls} | nl -w2 | awk -v OFS='\t' '{{print $2,$3,$4,$9$1,$6}}' > {output.bed}
+        """
\ No newline at end of file
diff --git a/workflow/rules/peakcall.smk b/workflow/rules/peakcall.smk
index cdc1762..f7953ee 100644
--- a/workflow/rules/peakcall.smk
+++ b/workflow/rules/peakcall.smk
@@ -13,6 +13,7 @@ workpath                        = config['project']['workpath']
 genome                          = config['options']['genome']
 paired_end                      = False if config['project']['nends'] == 1 else True
 chip2input                      = config['project']['peaks']['inputs']
+tmpdir                          = config['options']['tmp_dir']
 
 # Directory end points
 bam_dir                         = join(workpath, "bam")
@@ -37,16 +38,19 @@ rule sortByRead:
     output:
         temp(join(bam_dir, "{name}.sortedByRead.bam"))
     params:
-        rname="sortByRead",
-        samtools=config['tools']['SAMTOOLSVER'],
-        mem=allocated("mem", "sortByRead", cluster)
-    threads: int(allocated("threads", "sortByRead", cluster))
-    shell: """
-    module load {params.samtools}
-    samtools sort {input} -n \\
-        -@ {threads} \\
-        -o {output}
-    """
+        rname                           = "sortByRead",
+        samtools                        = config['tools']['SAMTOOLSVER'],
+        mem                             = allocated("mem", "sortByRead", cluster)
+    threads: 
+        int(allocated("threads", "sortByRead", cluster))
+    shell: 
+        """
+        module load {params.samtools}
+        samtools sort {input} -n \\
+            -@ {threads} \\
+            -o {output}
+        """
+
 
 rule genrich:
     """
@@ -65,253 +69,257 @@ rule genrich:
     output: 
         join(genrich_dir, "{name}", "{name}.narrowPeak")
     params:
-        rname="genrich",
-        genrich_ver=config['tools']['GENRICHVER']
-    shell: """
-    module load {params.genrich_ver}
-    Genrich \\
-        -t {input} \\
-        -o {output} \\
-        -j \\
-        -y \\
-        -r \\
-        -v \\
-        -d 150 \\
-        -m 5 \\
-        -e chrM,chrY
-    """
+        rname                           = "genrich",
+        genrich_ver                     = config['tools']['GENRICHVER']
+    shell: 
+        """
+        module load {params.genrich_ver}
+        Genrich \\
+            -t {input} \\
+            -o {output} \\
+            -j \\
+            -y \\
+            -r \\
+            -v \\
+            -d 150 \\
+            -m 5 \\
+            -e chrM,chrY
+        """
+
 
-# INDIVIDUAL RULES
 rule MACS2_narrow:
     input:
-        chip = lambda w: getMacChip(bam_dir, w.name, paired_end),
-        txt = lambda w: getMacTXT(ppqt_dir, w.name, paired_end),
-        c_option = lambda w: get_control_input(chip2input[w.name], paired_end, bam_dir),
+        chip                            = lambda w: getMacChip(bam_dir, w.name, paired_end),
+        txt                             = lambda w: getMacTXT(ppqt_dir, w.name, paired_end),
+        c_option                        = lambda w: get_control_input(chip2input[w.name], paired_end, bam_dir),
     output:
         join(macsN_dir, "{name}", "{name}_peaks.narrowPeak"),
     params:
-        rname='MACS2_narrow',
-        gsize=config['references'][genome]['EFFECTIVEGENOMESIZE'],
-        macsver=config['tools']['MACSVER'],
-        paired_end = paired_end,
-        flag= lambda w: "-c" if chip2input[w.name] else ""
-    shell: """
-    module load {params.macsver};
-    if [ '{params.paired_end}' == True ]; then
-        macs2 callpeak \\
-            -t {input.chip} {params.flag} {input.c_option} \\
-            -g {params.gsize} \\
-            -n {wildcards.name} \\
-            --outdir {macsN_dir}/{wildcards.name} \\
-            -q 0.01 \\
-            --keep-dup="all" \\
-            -f "BAMPE"
-    else
-        ppqt_len=$(awk '{{print $1}}' {input.txt})
-        macs2 callpeak \\
-            -t {input.chip} {params.flag} {input.c_option} \\
-            -g {params.gsize} \\
-            -n {wildcards.name} \\
-            --outdir {macsN_dir}/{wildcards.name} \\
-            -q 0.01 \\
-            --keep-dup="all" \\
-            --nomodel \\
-            --extsize $ppqt_len
-    fi
-    """
+        rname                           = 'MACS2_narrow',
+        gsize                           = config['references'][genome]['EFFECTIVEGENOMESIZE'],
+        macsver                         = config['tools']['MACSVER'],
+        flag                            = lambda w: "-c" if chip2input[w.name] else ""
+    shell: 
+        """
+        module load {params.macsver};
+        if [ '{paired_end}' == True ]; then
+            macs2 callpeak \\
+                -t {input.chip} {params.flag} {input.c_option} \\
+                -g {params.gsize} \\
+                -n {wildcards.name} \\
+                --outdir {macsN_dir}/{wildcards.name} \\
+                -q 0.01 \\
+                --keep-dup="all" \\
+                -f "BAMPE"
+        else
+            ppqt_len=$(awk '{{print $1}}' {input.txt})
+            macs2 callpeak \\
+                -t {input.chip} {params.flag} {input.c_option} \\
+                -g {params.gsize} \\
+                -n {wildcards.name} \\
+                --outdir {macsN_dir}/{wildcards.name} \\
+                -q 0.01 \\
+                --keep-dup="all" \\
+                --nomodel \\
+                --extsize $ppqt_len
+        fi
+        """
+
 
 rule MACS2_broad:
     input:
-        chip = lambda w: getMacChip(bam_dir, w.name, paired_end),
-        txt = lambda w: getMacTXT(ppqt_dir, w.name, paired_end),
-        c_option = lambda w: get_control_input(chip2input[w.name], paired_end, bam_dir),
+        chip                            = lambda w: getMacChip(bam_dir, w.name, paired_end),
+        txt                             = lambda w: getMacTXT(ppqt_dir, w.name, paired_end),
+        c_option                        = lambda w: get_control_input(chip2input[w.name], paired_end, bam_dir),
     output:
         join(macsB_dir, "{name}", "{name}_peaks.broadPeak"),
     params:
-        rname='MACS2_broad',
-        gsize=config['references'][genome]['EFFECTIVEGENOMESIZE'],
-        macsver=config['tools']['MACSVER'],
-        paired_end = paired_end,
-        flag= lambda w: "-c" if chip2input[w.name] else ""
-    shell: """
-    module load {params.macsver};
-    if [ '{params.paired_end}' == True ]; then
-        macs2 callpeak \\
-            -t {input.chip} {params.flag} {input.c_option} \\
-            -g {params.gsize} \\
-            -n {wildcards.name} \\
-            --outdir {macsB_dir}/{wildcards.name} \\
-            --broad \\
-            --broad-cutoff 0.01 \\
-            --keep-dup="all" \\
-            -f "BAMPE"
-    else 
-        ppqt_len=$(awk '{{print $1}}' {input.txt})
-        macs2 callpeak \\
-            -t {input.chip} {params.flag} {input.c_option} \\
-            -g {params.gsize} \\
-            -n {wildcards.name} \\
-            --outdir {macsB_dir}/{wildcards.name} \\
-            --broad \\
-            --broad-cutoff 0.01 \\
-            --keep-dup="all" \\
-            --nomodel \\
-            --extsize $ppqt_len
-    fi
-    """
+        rname                           = 'MACS2_broad',
+        gsize                           = config['references'][genome]['EFFECTIVEGENOMESIZE'],
+        macsver                         = config['tools']['MACSVER'],
+        flag                            = lambda w: "-c" if chip2input[w.name] else ""
+    shell: 
+        """
+        module load {params.macsver};
+        if [ '{paired_end}' == True ]; then
+            macs2 callpeak \\
+                -t {input.chip} {params.flag} {input.c_option} \\
+                -g {params.gsize} \\
+                -n {wildcards.name} \\
+                --outdir {macsB_dir}/{wildcards.name} \\
+                --broad \\
+                --broad-cutoff 0.01 \\
+                --keep-dup="all" \\
+                -f "BAMPE"
+        else 
+            ppqt_len=$(awk '{{print $1}}' {input.txt})
+            macs2 callpeak \\
+                -t {input.chip} {params.flag} {input.c_option} \\
+                -g {params.gsize} \\
+                -n {wildcards.name} \\
+                --outdir {macsB_dir}/{wildcards.name} \\
+                --broad \\
+                --broad-cutoff 0.01 \\
+                --keep-dup="all" \\
+                --nomodel \\
+                --extsize $ppqt_len
+        fi
+        """
+
 
 rule SICER:
-    input: 
-        chip = lambda w: getSicerChips(bam_dir, w.name, paired_end),
-        fragLen = lambda w: getSicerFragLen(ppqt_dir, qc_dir, w.name, paired_end),
-        c_option = lambda w: get_control_input(chip2input[w.name], paired_end, bam_dir),
+    input:
+        chip                            = lambda w: getSicerChips(bam_dir, w.name, paired_end),
+        fragLen                         = lambda w: getSicerFragLen(ppqt_dir, qc_dir, w.name, paired_end),
+        c_option                        = lambda w: get_control_input(chip2input[w.name], paired_end, bam_dir),
     output:
-        bed = join(sicer_dir, "{name}", "{name}_broadpeaks.bed"),
+        bed                             = join(sicer_dir, "{name}", "{name}_broadpeaks.bed"),
     params:
-        rname='SICER',
-        sicerver=config['tools']['SICERVER'],
-        bedtoolsver=config['tools']['BEDTOOLSVER'],
-        genomever = config['options']['genome'],
-        name="{name}",
-        sicer_dir=join(sicer_dir,"{name}"),
-        tmpdir=tmpdir,
-        paired_end = paired_end,
-        frac=config['references'][genome]['FRAC'],
-        flag= lambda w: "-c" if chip2input[w.name] else "",
-    shell: """
-    module load {params.sicerver}
-    module load {params.bedtoolsver}
-    if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
-    tmp=$(mktemp -d -p "{params.tmpdir}")
-    trap 'rm -rf "${{tmp}}"' EXIT
+        rname                           = 'SICER',
+        name                            = "{name}",
+        sicerver                        = config['tools']['SICERVER'],
+        bedtoolsver                     = config['tools']['BEDTOOLSVER'],
+        genomever                       = config['options']['genome'],
+        this_sicer_dir                  = join(sicer_dir,"{name}"),
+        frac                            = config['references'][genome]['FRAC'],
+        flag                            = lambda w: "-c" if chip2input[w.name] else "",
+    shell: 
+        """
+        module load {params.sicerver}
+        module load {params.bedtoolsver}
+        if [ ! -d "{tmpdir}" ]; then mkdir -p "{tmpdir}"; fi
+        tmp=$(mktemp -d -p "{tmpdir}")
+        trap 'rm -rf "${{tmp}}"' EXIT
 
-    if [ '{params.paired_end}' == True ]; then
-        MEAN_INSERT_SIZE=$(cat {input.fragLen} | awk '/MEDIAN_INSERT_SIZE/{{f=1;next}} /## HISTOGRAM/{{f=0}} f' | cut -f 6)
-        mean_insert_size=$(printf "%.0f" $MEAN_INSERT_SIZE)
-    else
-        mean_insert_size=$(awk '{{print $1}}' {input.fragLen})
-    fi
-    echo "printing out value of mean-insert-size ${{mean_insert_size}}"
-    a={input.c_option}
-    echo "Printing input.c_option ${{a}}"
-    if [ '{params.paired_end}' == True ]; then
-        if [ -f "{input.c_option}" ]; then
-            # Copying input to tmpdir due to SICER2
-            # bam2bed file conversion, if more than
-            # one sample shares the same IP sample
-            # than a race condition can occur where 
-            # two jobs can concurrent try to write 
-            # to the same BED file (during bedtools
-            # bam2bed that sicer calls).
-            input_bam="$(basename "{input.c_option}")"
-            cp {input.c_option} ${{tmp}}
-            echo "paired-end with input... ${{tmp}}/${{input_bam}}"
-            sicer \\
-            -t {input.chip} \\
-            -c "${{tmp}}/${{input_bam}}" \\
-            -s {params.genomever} \\
-            -rt 100 \\
-            -w 300 \\
-            -f ${{mean_insert_size}} \\
-            -egf {params.frac} \\
-            -g 600 \\
-            -fdr 1E-2 \\
-            -cpu 30 \\
-            -o ${{tmp}}
-            
-            mv ${{tmp}}/{params.name}.Q5DD-W300-G600-FDR0.01-island.bed {output.bed};
-            mv ${{tmp}}/{params.name}.Q5DD-W300-G600-islands-summary {params.sicer_dir}
+        if [ '{paired_end}' == True ]; then
+            MEAN_INSERT_SIZE=$(cat {input.fragLen} | awk '/MEDIAN_INSERT_SIZE/{{f=1;next}} /## HISTOGRAM/{{f=0}} f' | cut -f 6)
+            mean_insert_size=$(printf "%.0f" $MEAN_INSERT_SIZE)
         else
-            echo "paired-end without input"
-            sicer \\
-            -t {input.chip} \\
-            -s {params.genomever} \\
-            -rt 100 \\
-            -w 300 \\
-            -f ${{mean_insert_size}} \\
-            -egf {params.frac} \\
-            -g 600 \\
-            -e 100 \\
-            -cpu 30 \\
-            -o ${{tmp}}
-
-            mv ${{tmp}}/{params.name}.Q5DD-W300-G600.scoreisland {params.sicer_dir}
+            mean_insert_size=$(awk '{{print $1}}' {input.fragLen})
         fi
-    else
-        if [ -f "{input.c_option}" ]; then
-            echo "single-end with input"
-            cp {input.chip} ${{tmp}}/chip.bed.gz; gzip -d ${{tmp}}/chip.bed.gz;
-            awk 'BEGIN{{FS=OFS="\\t"}} {{gsub(/\./, 0, $5)}} 1' ${{tmp}}/chip.bed > ${{tmp}}/{params.name}.bed;
+        echo "printing out value of mean-insert-size ${{mean_insert_size}}"
+        a={input.c_option}
+        echo "Printing input.c_option ${{a}}"
+        if [ '{paired_end}' == True ]; then
+            if [ -f "{input.c_option}" ]; then
+                # Copying input to tmpdir due to SICER2
+                # bam2bed file conversion, if more than
+                # one sample shares the same IP sample
+                # than a race condition can occur where 
+                # two jobs can concurrent try to write 
+                # to the same BED file (during bedtools
+                # bam2bed that sicer calls).
+                input_bam="$(basename "{input.c_option}")"
+                cp {input.c_option} ${{tmp}}
+                echo "paired-end with input... ${{tmp}}/${{input_bam}}"
+                sicer \\
+                -t {input.chip} \\
+                -c "${{tmp}}/${{input_bam}}" \\
+                -s {params.genomever} \\
+                -rt 100 \\
+                -w 300 \\
+                -f ${{mean_insert_size}} \\
+                -egf {params.frac} \\
+                -g 600 \\
+                -fdr 1E-2 \\
+                -cpu 30 \\
+                -o ${{tmp}}
+                
+                mv ${{tmp}}/{params.name}.Q5DD-W300-G600-FDR0.01-island.bed {output.bed};
+                mv ${{tmp}}/{params.name}.Q5DD-W300-G600-islands-summary {params.this_sicer_dir}
+            else
+                echo "paired-end without input"
+                sicer \\
+                -t {input.chip} \\
+                -s {params.genomever} \\
+                -rt 100 \\
+                -w 300 \\
+                -f ${{mean_insert_size}} \\
+                -egf {params.frac} \\
+                -g 600 \\
+                -e 100 \\
+                -cpu 30 \\
+                -o ${{tmp}}
 
-            cp {input.c_option} ${{tmp}}/input.bed.gz; gzip -d ${{tmp}}/input.bed.gz;
-            awk 'BEGIN{{FS=OFS="\\t"}} {{gsub(/\./, 0, $5)}} 1' ${{tmp}}/input.bed > ${{tmp}}/inputV2.bed;
-            
-            sicer \\
-            -t ${{tmp}}/{params.name}.bed \\
-            -c ${{tmp}}/inputV2.bed \\
-            -s {params.genomever} \\
-            -rt 100 \\
-            -w 300 \\
-            -f ${{mean_insert_size}} \\
-            -egf {params.frac} \\
-            -g 600 \\
-            -fdr 1E-2 \\
-            -cpu 30 \\
-            -o ${{tmp}}
-            mv ${{tmp}}/{params.name}-W300-G600-FDR0.01-island.bed {output.bed};
-            mv ${{tmp}}/{params.name}-W300-G600-islands-summary {params.sicer_dir}
+                mv ${{tmp}}/{params.name}.Q5DD-W300-G600.scoreisland {params.this_sicer_dir}
+            fi
         else
-            echo "single-end without input"
-            cp {input.chip} ${{tmp}}/chip.bed.gz; gzip -d ${{tmp}}/chip.bed.gz;
-            awk 'BEGIN{{FS=OFS="\\t"}} {{gsub(/\./, 0, $5)}} 1' ${{tmp}}/chip.bed > ${{tmp}}/{params.name}.bed;
-            sicer \\
-            -t ${{tmp}}/{params.name}.bed \\
-            -s {params.genomever} \\
-            -rt 100 \\
-            -w 300 \\
-            -f ${{mean_insert_size}} \\
-            -egf {params.frac} \\
-            -g 600 \\
-            -e 100 \\
-            -cpu 30 \\
-            -o ${{tmp}}
-            mv ${{tmp}}/{params.name}-W300-G600.scoreisland {output.bed}
+            if [ -f "{input.c_option}" ]; then
+                echo "single-end with input"
+                cp {input.chip} ${{tmp}}/chip.bed.gz; gzip -d ${{tmp}}/chip.bed.gz;
+                awk 'BEGIN{{FS=OFS="\\t"}} {{gsub(/\./, 0, $5)}} 1' ${{tmp}}/chip.bed > ${{tmp}}/{params.name}.bed;
+
+                cp {input.c_option} ${{tmp}}/input.bed.gz; gzip -d ${{tmp}}/input.bed.gz;
+                awk 'BEGIN{{FS=OFS="\\t"}} {{gsub(/\./, 0, $5)}} 1' ${{tmp}}/input.bed > ${{tmp}}/inputV2.bed;
+                
+                sicer \\
+                -t ${{tmp}}/{params.name}.bed \\
+                -c ${{tmp}}/inputV2.bed \\
+                -s {params.genomever} \\
+                -rt 100 \\
+                -w 300 \\
+                -f ${{mean_insert_size}} \\
+                -egf {params.frac} \\
+                -g 600 \\
+                -fdr 1E-2 \\
+                -cpu 30 \\
+                -o ${{tmp}}
+                mv ${{tmp}}/{params.name}-W300-G600-FDR0.01-island.bed {output.bed};
+                mv ${{tmp}}/{params.name}-W300-G600-islands-summary {params.this_sicer_dir}
+            else
+                echo "single-end without input"
+                cp {input.chip} ${{tmp}}/chip.bed.gz; gzip -d ${{tmp}}/chip.bed.gz;
+                awk 'BEGIN{{FS=OFS="\\t"}} {{gsub(/\./, 0, $5)}} 1' ${{tmp}}/chip.bed > ${{tmp}}/{params.name}.bed;
+                sicer \\
+                -t ${{tmp}}/{params.name}.bed \\
+                -s {params.genomever} \\
+                -rt 100 \\
+                -w 300 \\
+                -f ${{mean_insert_size}} \\
+                -egf {params.frac} \\
+                -g 600 \\
+                -e 100 \\
+                -cpu 30 \\
+                -o ${{tmp}}
+                mv ${{tmp}}/{params.name}-W300-G600.scoreisland {output.bed}
+            fi
         fi
-    fi
-    """
+        """
+
 
 rule MEME:
     input:
-        bed = lambda w: join(workpath, w.PeakTool, w.name, w.name + PeakExtensions[w.PeakTool])
+        bed                             = lambda w: join(workpath, w.PeakTool, w.name, w.name + PeakExtensions[w.PeakTool])
     output:
-        meme_out = join(MEME_dir, "{PeakTool}", "{name}_meme", "meme-chip.html"),
-        ame_out = join(MEME_dir, "{PeakTool}", "{name}_ame", "ame.html")
+        meme_out                        = join(MEME_dir, "{PeakTool}", "{name}_meme", "meme-chip.html"),
+        ame_out                         = join(MEME_dir, "{PeakTool}", "{name}_ame", "ame.html")
     params:
-        rname='MEME',
-        ref_fa=config['references'][genome]['GENOME'],
-        meme_vertebrates_db=config['references'][genome]['MEME_VERTEBRATES_DB'],
-        meme_euk_db=config['references'][genome]['MEME_EUKARYOTE_DB'],
-        meme_genome_db=config['references'][genome]['MEME_GENOME_DB'],
-        oc=join(MEME_dir, "{PeakTool}", "{name}"),
-        tmpdir=tmpdir,
-        outfa="{name}.fa",
-        ntasks=int(28)
-    shell: """
-    module load meme
-    module load bedtools
-    if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
-    tmp=$(mktemp -d -p "{params.tmpdir}")
-    trap 'rm -rf "${{tmp}}"' EXIT
+        rname                           = 'MEME',
+        ref_fa                          = config['references'][genome]['GENOME'],
+        meme_vertebrates_db             = config['references'][genome]['MEME_VERTEBRATES_DB'],
+        meme_euk_db                     = config['references'][genome]['MEME_EUKARYOTE_DB'],
+        meme_genome_db                  = config['references'][genome]['MEME_GENOME_DB'],
+        oc                              = join(MEME_dir, "{PeakTool}", "{name}"),
+        tmpdir                          = tmpdir,
+        outfa                           = "{name}.fa",
+        ntasks                          = int(28)
+    shell: 
+        """
+        module load meme
+        module load bedtools
+        if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
+        tmp=$(mktemp -d -p "{params.tmpdir}")
+        trap 'rm -rf "${{tmp}}"' EXIT
 
-    bedtools getfasta -fi {params.ref_fa} -bed {input.bed} -fo ${{tmp}}/{params.outfa}
-    meme-chip \\
-      --oc {params.oc}_meme \\
-      -db {params.meme_vertebrates_db} \\
-      -meme-searchsize 34000000 \\
-      -meme-p {params.ntasks} \\
-      ${{tmp}}/{params.outfa}
+        bedtools getfasta -fi {params.ref_fa} -bed {input.bed} -fo ${{tmp}}/{params.outfa}
+        meme-chip \\
+        --oc {params.oc}_meme \\
+        -db {params.meme_vertebrates_db} \\
+        -meme-searchsize 34000000 \\
+        -meme-p {params.ntasks} \\
+        ${{tmp}}/{params.outfa}
 
-    ame \\
-    --oc {params.oc}_ame ${{tmp}}/{params.outfa} \\
-    {params.meme_euk_db} {params.meme_vertebrates_db} {params.meme_genome_db}
-    """
\ No newline at end of file
+        ame \\
+        --oc {params.oc}_ame ${{tmp}}/{params.outfa} \\
+        {params.meme_euk_db} {params.meme_vertebrates_db} {params.meme_genome_db}
+        """
\ No newline at end of file
diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk
index 26861ef..48343f9 100644
--- a/workflow/rules/qc.smk
+++ b/workflow/rules/qc.smk
@@ -14,6 +14,7 @@ paired_end                      = False if config['project']['nends'] == 1 else
 samples                         = config['samples']
 ends                            = [1] if not paired_end else [1, 2]
  
+
 # ~~ directories
 qc_dir                          = join(workpath, "QC")
 kraken_dir                      = join(workpath, 'kraken')
@@ -157,31 +158,32 @@ rule fastqc:
         config['tools']['FASTQCVER']
     threads:
         int(allocated("threads", "fastqc", cluster))
-    shell: """
-    # Setups temporary directory for
-    # intermediate files with built-in 
-    # mechanism for deletion on exit
-    if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
-    tmp=$(mktemp -d -p "{params.tmpdir}")
-    trap 'rm -rf "${{tmp}}"' EXIT
+    shell: 
+        """
+        # Setups temporary directory for
+        # intermediate files with built-in 
+        # mechanism for deletion on exit
+        if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
+        tmp=$(mktemp -d -p "{params.tmpdir}")
+        trap 'rm -rf "${{tmp}}"' EXIT
 
-    # Running fastqc with local
-    # disk or a tmpdir, fastqc
-    # has been observed to lock
-    # up gpfs filesystems, adding
-    # this on request by HPC staff
-    fastqc \\
-        {input} \\
-        -t {threads} \\
-        -o "${{tmp}}"
-    
-    # Copy output files from tmpdir
-    # to output directory
-    find "${{tmp}}" \\
-        -type f \\
-        \\( -name '*.html' -o -name '*.zip' \\) \\
-        -exec cp {{}} {params.outdir} \\;
-    """
+        # Running fastqc with local
+        # disk or a tmpdir, fastqc
+        # has been observed to lock
+        # up gpfs filesystems, adding
+        # this on request by HPC staff
+        fastqc \\
+            {input} \\
+            -t {threads} \\
+            -o "${{tmp}}"
+        
+        # Copy output files from tmpdir
+        # to output directory
+        find "${{tmp}}" \\
+            -type f \\
+            \\( -name '*.html' -o -name '*.zip' \\) \\
+            -exec cp {{}} {params.outdir} \\;
+        """
 
 rule fastq_screen:
     """
@@ -232,6 +234,7 @@ rule fastq_screen:
             {input}
         """
 
+
 rule kraken:
     """
     Quality-control step to assess for potential sources of microbial contamination.
@@ -293,6 +296,7 @@ rule kraken:
             ktImportTaxonomy - -o {output.kronahtml}
         """
 
+
 rule multiqc:
     """
     Reporting step to aggregate sample statistics and quality-control information
@@ -321,16 +325,17 @@ rule multiqc:
         multiqc                 = config['tools']['MULTIQCVER'],
 	    qcconfig                = join(workpath, config['shared_resources']['MULTIQC_CONFIG']),
 	    excludedir              = join(workpath, extra_fingerprint_dir),
-    shell: """
-    module load {params.multiqc}
-    multiqc \\
-        -f \\
-        -c {params.qcconfig} \\
-        --interactive \\
-        -e cutadapt \\
-        --ignore {params.excludedir} \\
-        -d """ + workpath + """
-    """
+    shell: 
+        """
+        module load {params.multiqc}
+        multiqc \\
+            -f \\
+            -c {params.qcconfig} \\
+            --interactive \\
+            -e cutadapt \\
+            --ignore {params.excludedir} \\
+            -d """ + workpath + """
+        """
 
 
 rule insert_size:
@@ -362,9 +367,11 @@ rule insert_size:
             -H {output.pdf}
         """
 
+
 rule deeptools_QC:
     input:
-        [ join(workpath, bw_dir, name + ".Q5DD.RPGC.bw") for name in samples ] # this should be all bigwigs
+        # this should be all bigwigs
+        [ join(workpath, bw_dir, name + ".Q5DD.RPGC.bw") for name in samples ] 
     output:
         javaram                 = '16g',
         heatmap                 = join(deeptools_dir, "spearman_heatmap.Q5DD.pdf"),
@@ -374,7 +381,8 @@ rule deeptools_QC:
     params:
         rname                   = "deeptools_QC",
         deeptoolsver            = config['tools']['DEEPTOOLSVER'],
-        labels=samples # this should be the sample names to match the bigwigs in the same order
+        # this should be the sample names to match the bigwigs in the same order
+        labels                  = samples 
     shell: 
         """    
         module load {params.deeptoolsver}
@@ -384,10 +392,11 @@ rule deeptools_QC:
         plotPCA -in {output.npz} -o {output.pca}
         """
 
+
 rule FRiP:
     input:
-        bed = lambda w: [ join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) for chip in chips ],
-        bam = join(bam_dir, "{name}.Q5DD.bam"),
+        bed                     = lambda w: [ join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) for chip in chips ],
+        bam                     = join(bam_dir, "{name}.Q5DD.bam"),
     output:
         join(workpath,"PeakQC","{PeakTool}.{name}.Q5DD.FRiP_table.txt"),
     params:
@@ -414,6 +423,7 @@ rule FRiP:
             -o {params.outroot}
         """
 
+
 rule jaccard:
     input:
         lambda w: [ join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) for chip in chips ],
@@ -432,4 +442,4 @@ rule jaccard:
             -i "{input}" \\
             -o "{params.outroot}" \\
             -g {params.genome}
-        """
+        """
\ No newline at end of file
diff --git a/workflow/rules/trim_align_dedup.smk b/workflow/rules/trim_align_dedup.smk
index 8eae5cd..1fe15a8 100644
--- a/workflow/rules/trim_align_dedup.smk
+++ b/workflow/rules/trim_align_dedup.smk
@@ -187,7 +187,8 @@ rule BWA:
         idxstat1                            = join(bam_dir, "{name}.sorted.bam.idxstat"),
         flagstat2                           = join(bam_dir, "{name}.Q5.bam.flagstat"),
         idxstat2                            = join(bam_dir, "{name}.Q5.bam.idxstat"),
-    threads: 32
+    threads: 
+        32
     shell: 
         """
         module load {params.bwaver};
@@ -219,6 +220,7 @@ rule BWA:
         fi
         """
 
+
 rule dedup:
     """
     Picard MarkDuplicates removes duplicates from bam file.
@@ -311,6 +313,7 @@ rule dedup:
         fi
         """
 
+
 rule ppqt:
     input:
         bam                                 = lambda w : join(bam_dir, w.name + "." + w.ext + "." + get_bam_ext(w.ext, paired_end))
@@ -325,7 +328,8 @@ rule ppqt:
         scriptPy                            = join(workpath, "bin", "ppqt_process.py"),
         tmpdir                              = tmpdir,
         file_name                           = "{name}"
-    container: config['images']['ppqt']
+    container: 
+        config['images']['ppqt']
     shell: 
         """
         if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
@@ -355,6 +359,7 @@ rule ppqt:
         python {params.scriptPy} -i {output.ppqt} -o {output.txt}
         """
 
+
 rule bam2bw:
     """
     bamCoverage converts bams to bigwig files for read visialization
@@ -419,7 +424,7 @@ rule inputnorm:
        bigWig file of treatmment sample normalizes with its input control
     """
     input:
-        bws = lambda w: ctrl_test(chip2input, w.name, bw_dir)
+        bws                                 = lambda w: ctrl_test(chip2input, w.name, bw_dir)
     output:
         join(bw_dir, "{name}.Q5DD.RPGC.inputnorm.bw")
     params:
@@ -441,4 +446,4 @@ rule inputnorm:
             --operation 'subtract' \\
             --skipNonCoveredRegions \\
             -p {threads}
-        """
+        """
\ No newline at end of file
diff --git a/workflow/scripts/peakcall.py b/workflow/scripts/peakcall.py
index e3b07e1..fb03a3c 100644
--- a/workflow/scripts/peakcall.py
+++ b/workflow/scripts/peakcall.py
@@ -102,4 +102,31 @@ def getSicerFragLen(ppqt_dir, qc_dir, name, paired_end):
         fragLen = join(qc_dir, name + ".Q5DD.insert_size_metrics.txt")
     else:
         fragLen = join(ppqt_dir, name + ".Q5DD_tagAlign.ppqt.txt")
-    return fragLen
\ No newline at end of file
+    return fragLen
+
+
+def get_manorm_sizes(g1, g2, group_data, ppqt_in):
+    """
+    Build the MAnorm shift-size arguments for a pair of groups.
+
+    Reads `ppqt_in` as a whitespace-delimited table and, for each group,
+    takes the second field of the first row whose first field equals
+    `group_data[<group>]`.
+
+    @param g1 <str>: key into `group_data` for the first group
+    @param g2 <str>: key into `group_data` for the second group
+    @param group_data <dict>: maps a group key to the value matched against
+        the first column of `ppqt_in`
+    @param ppqt_in <str>: path to the table file
+    @return <str>: "--s1 <size1> --s2 <size2>"
+    @raises IndexError: if no row matches one of the groups
+    """
+    # Read the table eagerly and close the handle; the previous version bound
+    # an uncalled lambda here and then tried to iterate over it (TypeError).
+    with open(ppqt_in, 'r') as fh:
+        rows = [fields for fields in (line.split() for line in fh) if len(fields) > 1]
+    # NOTE(review): the manorm rule indexes groupdata[group][0] to get a sample
+    # name; confirm group_data[g] is directly comparable to the first column.
+    extsize1 = [row[1] for row in rows if row[0] == group_data[g1]][0]
+    extsize2 = [row[1] for row in rows if row[0] == group_data[g2]][0]
+    return f"--s1 {extsize1} --s2 {extsize2}"
\ No newline at end of file

From 65e3e3c2abd5f9ac069affb5dd1022f6b169d218 Mon Sep 17 00:00:00 2001
From: Ryan Routsong <routsongrm@nih.gov>
Date: Thu, 11 Jul 2024 14:05:17 -0400
Subject: [PATCH 12/28] fix: reference global tmpdir and paired_end flag

---
 workflow/rules/dba.smk      |  2 +-
 workflow/rules/peakcall.smk | 17 ++++++++---------
 2 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/workflow/rules/dba.smk b/workflow/rules/dba.smk
index 80265a4..7878039 100644
--- a/workflow/rules/dba.smk
+++ b/workflow/rules/dba.smk
@@ -255,7 +255,7 @@ rule manorm:
         rname                           = 'manorm',
         fldr                            = join(manorm_dir, "{group1}_vs_{group2}-{tool}"),
         bedtoolsver                     = config['tools']['BEDTOOLSVER'],
-        manormver                       = "manorm/1.1.4"
+        manormver                       = "manorm/1.1.4",
         extsizes                        = lambda w, _in: get_manorm_sizes(w.group1, w.group2, groupdata, _in.ppqt)
     shell:
         """
diff --git a/workflow/rules/peakcall.smk b/workflow/rules/peakcall.smk
index f7953ee..e81f5a4 100644
--- a/workflow/rules/peakcall.smk
+++ b/workflow/rules/peakcall.smk
@@ -102,7 +102,7 @@ rule MACS2_narrow:
     shell: 
         """
         module load {params.macsver};
-        if [ '{params.paired_end}' == True ]; then
+        if [ '""" + str(paired_end) + """' == True ]; then
             macs2 callpeak \\
                 -t {input.chip} {params.flag} {input.c_option} \\
                 -g {params.gsize} \\
@@ -141,7 +141,7 @@ rule MACS2_broad:
     shell: 
         """
         module load {params.macsver};
-        if [ '{params.paired_end}' == True ]; then
+        if [ '""" + str(paired_end) + """' == True ]; then
             macs2 callpeak \\
                 -t {input.chip} {params.flag} {input.c_option} \\
                 -g {params.gsize} \\
@@ -187,11 +187,11 @@ rule SICER:
         """
         module load {params.sicerver}
         module load {params.bedtoolsver}
-        if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
-        tmp=$(mktemp -d -p "{params.tmpdir}")
+        if [ ! -d \"""" + str(tmpdir) + """\" ]; then mkdir -p \"""" + str(tmpdir) + """\"; fi
+        tmp=$(mktemp -d -p \"""" + str(tmpdir) + """\")
         trap 'rm -rf "${{tmp}}"' EXIT
 
-        if [ '{params.paired_end}' == True ]; then
+        if [ '""" + str(paired_end) + """' == True ]; then
             MEAN_INSERT_SIZE=$(cat {input.fragLen} | awk '/MEDIAN_INSERT_SIZE/{{f=1;next}} /## HISTOGRAM/{{f=0}} f' | cut -f 6)
             mean_insert_size=$(printf "%.0f" $MEAN_INSERT_SIZE)
         else
@@ -200,7 +200,7 @@ rule SICER:
         echo "printing out value of mean-insert-size ${{mean_insert_size}}"
         a={input.c_option}
         echo "Printing input.c_option ${{a}}"
-        if [ '{params.paired_end}' == True ]; then
+        if [ '""" + str(paired_end) + """' == True ]; then
             if [ -f "{input.c_option}" ]; then
                 # Copying input to tmpdir due to SICER2
                 # bam2bed file conversion, if more than
@@ -294,7 +294,6 @@ rule MEME:
         meme_out                        = join(MEME_dir, "{PeakTool}", "{name}_meme", "meme-chip.html"),
         ame_out                         = join(MEME_dir, "{PeakTool}", "{name}_ame", "ame.html")
     params:
-        rname                           = 'SICER',
         rname                           = 'MEME',
         ref_fa                          = config['references'][genome]['GENOME'],
         meme_vertebrates_db             = config['references'][genome]['MEME_VERTEBRATES_DB'],
@@ -307,8 +306,8 @@ rule MEME:
         """
         module load meme
         module load bedtools
-        if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
-        tmp=$(mktemp -d -p "{params.tmpdir}")
+        if [ ! -d \"""" + str(tmpdir) + """\" ]; then mkdir -p \"""" + str(tmpdir) + """\"; fi
+        tmp=$(mktemp -d -p \"""" + str(tmpdir) + """\")
         trap 'rm -rf "${{tmp}}"' EXIT
 
         bedtools getfasta -fi {params.ref_fa} -bed {input.bed} -fo ${{tmp}}/{params.outfa}

From f94688691ea060500b63139d8a87d2d86443f1b7 Mon Sep 17 00:00:00 2001
From: Ryan Routsong <routsongrm@nih.gov>
Date: Thu, 11 Jul 2024 14:40:49 -0400
Subject: [PATCH 13/28] fix: fix rep switch

---
 workflow/Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/Snakefile b/workflow/Snakefile
index e025c45..6f0e4e7 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -30,7 +30,7 @@ zipGroup1, zipGroup2, zipToolC, contrasts \
                                 = zip_contrasts(contrast, PeakTools)
 file_stems, extRPGC, extaln     = get_file_components(paired_end)
 groups                          = list(groupdatawinput.keys())
-reps                            = False if len(groupswreps) > 0 else True
+reps                            = True if len(groupswreps) > 0 else False
 uniq_inputs                     = list(sorted(set([v for v in chip2input.values() if v])))
 sampleswinput                   = [
     chip_value for input_id, chip_value in chip2input.items() \

From af5e79149a8ce71fe2d961b80455b64506e1a751 Mon Sep 17 00:00:00 2001
From: Ryan Routsong <routsongrm@nih.gov>
Date: Thu, 11 Jul 2024 15:42:17 -0400
Subject: [PATCH 14/28] chore: more spacing, fix reps flag reversal

---
 workflow/Snakefile                  | 106 ++++++++++++++++-----------
 workflow/rules/dba.smk              |  12 ++--
 workflow/rules/trim_align_dedup.smk | 108 +++++++++++++---------------
 3 files changed, 123 insertions(+), 103 deletions(-)

diff --git a/workflow/Snakefile b/workflow/Snakefile
index 6f0e4e7..4b9c2c7 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -38,6 +38,7 @@ sampleswinput                   = [
 ]
 inputnorm                       = [""] if len(sampleswinput) == 0 else ["", ".inputnorm"]
 deepgroups, deepexts            = group_output_files(extRPGC, groups, inputnorm)
+UropaCats                       = ["protTSS", "prot", "protSEC", "genes"]
 
 # Directory end points
 bam_dir                         = join(workpath, "bam")
@@ -54,6 +55,7 @@ diffbind_dir                    = join(workpath, "DiffBind")
 cfTool_dir                      = join(workpath, "cfChIPtool")
 genrich_dir                     = join(workpath, "Genrich")
 MEME_dir                        = join(workpath, "MEME")
+manorm_dir                      = join(workpath, "MANorm")
 
 # Read in resource information
 with open(join('config', 'cluster.json')) as fh:
@@ -87,8 +89,6 @@ if assay == "cfchip":
         join(uropa_dir, "promoterTable1", "{PeakTool}_promoter_overlap_summaryTable.txt"), 
         PeakTool=PeakTools
     ))
-
-
     if reps:
         rule_all_ins.extend(expand(
             join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), 
@@ -100,47 +100,73 @@ if assay == "cfchip":
             name=contrasts,
              _type=["protTSS"]
         ))
-    elif assay in ["atac", "chip"]:
-        peak_types.extend(["prot", "protSEC", "genes"])
-        rule_all_ins.extend(expand(join(MEME_dir, "{PeakTool}", "{name}_meme", "meme-chip.html"), PeakTool=PeakTools, name=chips))
-        rule_all_ins.extend(expand(join(MEME_dir, "{PeakTool}", "{name}_ame", "ame.html"), PeakTool=PeakTools, name=chips))
+    # else:
+    #     rule_all_ins.extend(expand(
+    #         join(uropa_dir, '{PeakTool}', '{name}_{PeakTool}_uropa_{_type}_allhits.txt'), 
+    #         PeakTool="MANorm", 
+    #         name=contrasts, 
+    #         _type=UropaCats
+    #     ))
+    #     rule_all_ins.extend(expand(
+    #         join(manorm_dir, "{group1}_vs_{group2}-{tool}", "{group1}_vs_{group2}-{tool}_all_MAvalues.xls"), 
+    #         group1=zipGroup1, 
+    #         group2=zipGroup2, 
+    #         tool=zipToolC
+    #     ))
+elif assay in ["atac", "chip"]:
+    peak_types.extend(["prot", "protSEC", "genes"])
+    rule_all_ins.extend(expand(join(MEME_dir, "{PeakTool}", "{name}_meme", "meme-chip.html"), PeakTool=PeakTools, name=chips))
+    rule_all_ins.extend(expand(join(MEME_dir, "{PeakTool}", "{name}_ame", "ame.html"), PeakTool=PeakTools, name=chips))
+    if paired_end:
+        rule_all_ins.extend(expand(join(qc_dir, "{name}.{stem}.insert_size_metrics.txt"), name=samples, stem=file_stems))
+    if assay == "chip":
+        rule_all_ins.extend(expand(join(macsB_dir, "{name}", "{name}_peaks.broadPeak"), name=chips))
+        rule_all_ins.extend(expand(join(sicer_dir, "{name}", "{name}_broadpeaks.bed"), name=chips))
         if paired_end:
-            rule_all_ins.extend(expand(join(qc_dir, "{name}.{stem}.insert_size_metrics.txt"), name=samples, stem=file_stems))
-        if assay == "chip":
-            rule_all_ins.extend(expand(join(macsB_dir, "{name}", "{name}_peaks.broadPeak"), name=chips))
-            rule_all_ins.extend(expand(join(sicer_dir, "{name}", "{name}_broadpeaks.bed"), name=chips))
-            if paired_end:
-                short_ext, tag_ext = ["sorted", "Q5DD"], ["sorted", "Q5DD_tagAlign"]
-                rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt"), name=samples, ext=short_ext))
-                rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.pdf"), name=samples, ext=short_ext))
-                rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt.txt"), name=samples, ext=short_ext))
-                rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt"), name=samples, ext=tag_ext))
-                rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.pdf"), name=samples, ext=tag_ext))
-                rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt.txt"), name=samples, ext=tag_ext))
-        elif assay == "atac":
-            rule_all_ins.extend(expand(
-                join(genrich_dir, "{name}", "{name}.narrowPeak"), name=chips
-            ))
-        if reps:
-            rule_all_ins.extend(expand(
-                join(uropa_dir, "{PeakTool}", "{name}_{PeakTool}_uropa_{_type}_allhits.txt"),
-                PeakTool=PeakTools, name=chips, _type=peak_types
-            ))
-            rule_all_ins.extend(expand(
-                join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), 
-                group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC
-            ))
+            short_ext, tag_ext = ["sorted", "Q5DD"], ["sorted", "Q5DD_tagAlign"]
+            rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt"), name=samples, ext=short_ext))
+            rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.pdf"), name=samples, ext=short_ext))
+            rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt.txt"), name=samples, ext=short_ext))
+            rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt"), name=samples, ext=tag_ext))
+            rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.pdf"), name=samples, ext=tag_ext))
+            rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt.txt"), name=samples, ext=tag_ext))
+    elif assay == "atac":
+        rule_all_ins.extend(expand(
+            join(genrich_dir, "{name}", "{name}.narrowPeak"), name=chips
+        ))
+    if reps:
+        rule_all_ins.extend(expand(
+            join(uropa_dir, "{PeakTool}", "{name}_{PeakTool}_uropa_{_type}_allhits.txt"),
+            PeakTool=PeakTools, name=chips, _type=peak_types
+        ))
+        rule_all_ins.extend(expand(
+            join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), 
+            group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC
+        ))
+        rule_all_ins.extend(expand(
+            join(uropa_dir, "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), 
+            PeakTool=["DiffbindEdgeR", "DiffbindDeseq2"], 
+            name=contrasts, 
+            _type=peak_types
+        ))
+        if contrast:
             rule_all_ins.extend(expand(
-                join(uropa_dir, "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), 
-                PeakTool=["DiffbindEdgeR", "DiffbindDeseq2"], 
-                name=contrasts, 
-                _type=peak_types
+                join(uropa_dir, "promoterTable2", "DiffbindDeseq2_{PeakTool}_promoter_overlap_summaryTable.txt"), 
+                PeakTool=PeakTools
             ))
-            if contrast:
-                rule_all_ins.extend(expand(
-                    join(uropa_dir, "promoterTable2", "DiffbindDeseq2_{PeakTool}_promoter_overlap_summaryTable.txt"), 
-                    PeakTool=PeakTools
-                ))
+    # else:
+    #     rule_all_ins.extend(expand(
+    #         join(uropa_dir, '{PeakTool}', '{name}_{PeakTool}_uropa_{_type}_allhits.txt'), 
+    #         PeakTool="MANorm", 
+    #         name=contrasts, 
+    #         _type=UropaCats
+    #     ))
+    #     rule_all_ins.extend(expand(
+    #         join(manorm_dir, "{group1}_vs_{group2}-{tool}", "{group1}_vs_{group2}-{tool}_all_MAvalues.xls"), 
+    #         group1=zipGroup1, 
+    #         group2=zipGroup2, 
+    #         tool=zipToolC
+    #     ))
 
 rule_all_ins.append(join(workpath,"multiqc_report.html"))
 rule_all_ins.extend(expand(join(qc_dir, "{name}.preseq.dat"), name=samples))
diff --git a/workflow/rules/dba.smk b/workflow/rules/dba.smk
index 7878039..42f811e 100644
--- a/workflow/rules/dba.smk
+++ b/workflow/rules/dba.smk
@@ -242,15 +242,15 @@ rule manorm:
     input:
         bam1                            = lambda w: join(bam_dir, groupdata[w.group1][0] + ".Q5DD.bam"),
         bam2                            = lambda w: join(bam_dir, groupdata[w.group2][0] + ".Q5DD.bam"),
-        ppqt                            = join(bam_dir, "Q5DD.ppqt.txt"),
+        ppqt                            = join(ppqt_dir, "Q5DD.ppqt.txt"),
         peak1                           = lambda w: join(workpath, w.tool, groupdata[w.group1][0], groupdata[w.group1][0] + PeakExtensions[w.tool]),
         peak2                           = lambda w: join(workpath, w.tool, groupdata[w.group2][0], groupdata[w.group2][0] + PeakExtensions[w.tool]),
     output:
-        xls                             = join(manorm_dir, "{group1}_vs_{group2}-{tool}","{group1}_vs_{group2}-{tool}_all_MAvalues.xls"),
-        bed                             = temp(join(manorm_dir, "{group1}_vs_{group2}-{tool}","{group1}_vs_{group2}-{tool}_all_MA.bed")),
-        wigA                            = join(manorm_dir, "{group1}_vs_{group2}-{tool}","output_tracks","{group1}_vs_{group2}_A_values.wig.gz"),
-        wigM                            = join(manorm_dir, "{group1}_vs_{group2}-{tool}","output_tracks","{group1}_vs_{group2}_M_values.wig.gz"),
-        wigP                            = join(manorm_dir, "{group1}_vs_{group2}-{tool}","output_tracks","{group1}_vs_{group2}_P_values.wig.gz"),
+        xls                             = join(manorm_dir, "{group1}_vs_{group2}-{tool}", "{group1}_vs_{group2}-{tool}_all_MAvalues.xls"),
+        bed                             = temp(join(manorm_dir, "{group1}_vs_{group2}-{tool}", "{group1}_vs_{group2}-{tool}_all_MA.bed")),
+        wigA                            = join(manorm_dir, "{group1}_vs_{group2}-{tool}", "output_tracks", "{group1}_vs_{group2}_A_values.wig.gz"),
+        wigM                            = join(manorm_dir, "{group1}_vs_{group2}-{tool}", "output_tracks", "{group1}_vs_{group2}_M_values.wig.gz"),
+        wigP                            = join(manorm_dir, "{group1}_vs_{group2}-{tool}", "output_tracks", "{group1}_vs_{group2}_P_values.wig.gz"),
     params:
         rname                           = 'manorm',
         fldr                            = join(manorm_dir, "{group1}_vs_{group2}-{tool}"),
diff --git a/workflow/rules/trim_align_dedup.smk b/workflow/rules/trim_align_dedup.smk
index 1fe15a8..5665956 100644
--- a/workflow/rules/trim_align_dedup.smk
+++ b/workflow/rules/trim_align_dedup.smk
@@ -60,8 +60,6 @@ rule trim:
         trailingquality                     = 10,
         javaram                             = "64g",
         sample                              = "{name}",
-        tmpdir                              = tmpdir,
-        paired_end                          = paired_end
     threads: 
         16
     shell: 
@@ -70,11 +68,11 @@ rule trim:
         module load {params.bwaver};
         module load {params.samtoolsver};
         module load {params.picardver};
-        if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
-        tmp=$(mktemp -d -p "{params.tmpdir}")
+        if [ ! -d \"""" + tmpdir + """\" ]; then mkdir -p \"""" + tmpdir + """\"; fi
+        tmp=$(mktemp -d -p \"""" + tmpdir + """\")
         trap 'rm -rf "${{tmp}}"' EXIT
 
-        if [ '{params.paired_end}' == True ]; then
+        if [ \"""" + str(paired_end) + """\" == True ]; then
             cutadapt \\
                 --pair-filter=any \\
                 --nextseq-trim=2 \\
@@ -241,11 +239,11 @@ rule dedup:
     input:
         bam2                                = join(bam_dir,"{name}.Q5.bam")
     output:
-        out5                                = join(workpath,bam_dir,"{name}.Q5DD.bam"),
-        out5f                               = join(workpath,bam_dir,"{name}.Q5DD.bam.flagstat"),
-        out5i                               = join(workpath,bam_dir,"{name}.Q5DD.bam.idxstat"),
-        out6                                = provided(join(workpath,bam_dir,"{name}.bwa.Q5.duplic"), paired_end),
-        out7                                = dedup_out7(join(workpath,bam_dir,"{name}"), assay, paired_end)
+        out5                                = join(bam_dir, "{name}.Q5DD.bam"),
+        out5f                               = join(bam_dir, "{name}.Q5DD.bam.flagstat"),
+        out5i                               = join(bam_dir, "{name}.Q5DD.bam.idxstat"),
+        out6                                = provided(join(bam_dir, "{name}.bwa.Q5.duplic"), paired_end),
+        out7                                = dedup_out7(join(bam_dir, "{name}"), assay, paired_end)
     params:
         rname                               = 'dedup',
         picardver                           = config['tools']['PICARDVER'],
@@ -253,7 +251,6 @@ rule dedup:
         bedtoolsver                         = config['tools']['BEDTOOLSVER'],
         macsver                             = config['tools']['MACSVER'],
         gsize                               = config['references'][genome]['EFFECTIVEGENOMESIZE'],
-        folder                              = join(workpath,bam_dir),
         genomefile                          = config['references'][genome]['REFLEN'],
         rver                                = config['tools']['RVER'],
         javaram                             = '16g',
@@ -267,49 +264,49 @@ rule dedup:
         module load {params.bedtoolsver};
         module load {params.macsver};
         module load {params.rver}; 
-        if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
-        tmp=$(mktemp -d -p "{params.tmpdir}")
+        if [ ! -d \"""" + tmpdir + """\" ]; then mkdir -p \"""" + tmpdir + """\"; fi
+        tmp=$(mktemp -d -p \"""" + tmpdir + """\")
         trap 'rm -rf "${{tmp}}"' EXIT
         
         if [ "{assay}" == "cfchip" ];then
-        java -Xmx{params.javaram} \\
-            -jar $PICARDJARPATH/picard.jar MarkDuplicates \\
-            -I {input.bam2} \\
-            -O {params.tmpBam} \\
-            -TMP_DIR ${{tmp}} \\
-            -VALIDATION_STRINGENCY SILENT \\
-            -REMOVE_DUPLICATES true \\
-            -METRICS_FILE {output.out6};
-        samtools index {params.tmpBam};
-        samtools view -b {params.tmpBam} chr{{1..22}} > {output.out5};
-        Rscript {params.rscript} {params.tmpBam} {output.out7};
-        rm {params.tmpBam} {params.tmpBam}.bai;
-        samtools index {output.out5};
-        samtools flagstat {output.out5} > {output.out5f};
-        samtools idxstats {output.out5} > {output.out5i}; 
+            java -Xmx{params.javaram} \\
+                -jar $PICARDJARPATH/picard.jar MarkDuplicates \\
+                -I {input.bam2} \\
+                -O {params.tmpBam} \\
+                -TMP_DIR ${{tmp}} \\
+                -VALIDATION_STRINGENCY SILENT \\
+                -REMOVE_DUPLICATES true \\
+                -METRICS_FILE {output.out6};
+            samtools index {params.tmpBam};
+            samtools view -b {params.tmpBam} chr{{1..22}} > {output.out5};
+            Rscript {params.rscript} {params.tmpBam} {output.out7};
+            rm {params.tmpBam} {params.tmpBam}.bai;
+            samtools index {output.out5};
+            samtools flagstat {output.out5} > {output.out5f};
+            samtools idxstats {output.out5} > {output.out5i}; 
         elif [ '""" + str(paired_end) + """' == False ];then
-        macs2 filterdup -i {input} -g {params.gsize} --keep-dup="auto" -o ${{tmp}}/TmpTagAlign;
-        awk -F"\\t" -v OFS="\\t" '{{if ($2>0 && $3>0) {{print}}}}' ${{tmp}}/TmpTagAlign > ${{tmp}}/TmpTagAlign2;
-        awk -F"\\t" -v OFS="\\t" '{{print $1,1,$2}}' {params.genomefile} | sort -k1,1 -k2,2n > ${{tmp}}/GenomeFileBed;
-        bedtools intersect -wa -f 1.0 -a ${{tmp}}/TmpTagAlign2 -b ${{tmp}}/GenomeFileBed > ${{tmp}}/TmpTagAlign3;
-        bedtools bedtobam -i ${{tmp}}/TmpTagAlign3 -g {params.genomefile} | samtools sort -@4 -o {output.out5};
-        gzip ${{tmp}}/TmpTagAlign3;
-        mv ${{tmp}}/TmpTagAlign3.gz {output.out7};
-        samtools index {output.out5};
-        samtools flagstat {output.out5} > {output.out5f}
-        samtools idxstats {output.out5} > {output.out5i}
+            macs2 filterdup -i {input} -g {params.gsize} --keep-dup="auto" -o ${{tmp}}/TmpTagAlign;
+            awk -F"\\t" -v OFS="\\t" '{{if ($2>0 && $3>0) {{print}}}}' ${{tmp}}/TmpTagAlign > ${{tmp}}/TmpTagAlign2;
+            awk -F"\\t" -v OFS="\\t" '{{print $1,1,$2}}' {params.genomefile} | sort -k1,1 -k2,2n > ${{tmp}}/GenomeFileBed;
+            bedtools intersect -wa -f 1.0 -a ${{tmp}}/TmpTagAlign2 -b ${{tmp}}/GenomeFileBed > ${{tmp}}/TmpTagAlign3;
+            bedtools bedtobam -i ${{tmp}}/TmpTagAlign3 -g {params.genomefile} | samtools sort -@4 -o {output.out5};
+            gzip ${{tmp}}/TmpTagAlign3;
+            mv ${{tmp}}/TmpTagAlign3.gz {output.out7};
+            samtools index {output.out5};
+            samtools flagstat {output.out5} > {output.out5f}
+            samtools idxstats {output.out5} > {output.out5i}
         else
-        java -Xmx{params.javaram} \\
-            -jar $PICARDJARPATH/picard.jar MarkDuplicates \\
-            -I {input.bam2} \\
-            -O {output.out5} \\
-            -TMP_DIR ${{tmp}} \\
-            -VALIDATION_STRINGENCY SILENT \\
-            -REMOVE_DUPLICATES true \\
-            -METRICS_FILE {output.out6};
-        samtools index {output.out5};
-        samtools flagstat {output.out5} > {output.out5f};
-        samtools idxstats {output.out5} > {output.out5i}; 
+            java -Xmx{params.javaram} \\
+                -jar $PICARDJARPATH/picard.jar MarkDuplicates \\
+                -I {input.bam2} \\
+                -O {output.out5} \\
+                -TMP_DIR ${{tmp}} \\
+                -VALIDATION_STRINGENCY SILENT \\
+                -REMOVE_DUPLICATES true \\
+                -METRICS_FILE {output.out6};
+            samtools index {output.out5};
+            samtools flagstat {output.out5} > {output.out5f};
+            samtools idxstats {output.out5} > {output.out5i}; 
         fi
         """
 
@@ -326,14 +323,13 @@ rule ppqt:
         samtoolsver                         = config['tools']['SAMTOOLSVER'],
         rver                                = config['tools']['RVER'],
         scriptPy                            = join(workpath, "bin", "ppqt_process.py"),
-        tmpdir                              = tmpdir,
         file_name                           = "{name}"
     container: 
         config['images']['ppqt']
     shell: 
         """
-        if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
-        tmp=$(mktemp -d -p "{params.tmpdir}")
+        if [ ! -d \"""" + tmpdir + """\" ]; then mkdir -p \"""" + tmpdir + """\"; fi
+        tmp=$(mktemp -d -p \"""" + tmpdir + """\")
         trap 'rm -rf "${{tmp}}"' EXIT
 
         if [ '""" + str(paired_end) + """' == True ]; then
@@ -380,20 +376,18 @@ rule bam2bw:
         rname                               = "bam2bw",
         name                                = "{name}",
         effectivegenomesize                 = config['references'][genome]['EFFECTIVEGENOMESIZE'],
-        paired_end                          = paired_end,
-        tmpdir                              = tmpdir,
     threads: 
         int(allocated("threads", "bam2bw", cluster)),
     envmodules: 
         config['tools']['DEEPTOOLSVER'],
     shell: 
         """
-        if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
-        tmp=$(mktemp -d -p "{params.tmpdir}")
+        if [ ! -d \"""" + tmpdir + """\" ]; then mkdir -p \"""" + tmpdir + """\"; fi
+        tmp=$(mktemp -d -p """ + tmpdir + """)
         trap 'rm -rf "${{tmp}}"' EXIT
 
         bam_cov_option={input.ppqt}
-        if [ '{params.paired_end}' == False ]; then
+        if [ \"""" + str(paired_end) + """\" == False ]; then
             ppqt_len=$(awk '{{print $1}}' {input.ppqt})
             bam_cov_option="-e ${{ppqt_len}}"
         else 

From 52cd4153fe916a2bc3eecf9a44320dc1986ac9e5 Mon Sep 17 00:00:00 2001
From: Ryan Routsong <routsongrm@nih.gov>
Date: Mon, 15 Jul 2024 14:10:52 -0400
Subject: [PATCH 15/28] chore: make bin files executable, fix execution issues
 from test data runs

---
 bin/DiffBind_v2_ChIPseq.Rmd       | 0
 bin/DiffBind_v2_ChIPseq_block.Rmd | 0
 bin/DiffBind_v2_cfChIP_QC.Rmd     | 0
 bin/FRiP_plot.R                   | 0
 bin/atac_nrf.py                   | 0
 bin/bam_filter_by_mapq.py         | 0
 bin/frip.py                       | 0
 bin/jaccard_score.py              | 0
 bin/ppqt_process.py               | 0
 bin/prep_diffbind.py              | 0
 bin/prep_diffbindQC.py            | 0
 config/containers.json            | 4 ++--
 src/run.py                        | 2 +-
 workflow/Snakefile                | 3 ++-
 workflow/rules/dba.smk            | 9 +++++++--
 workflow/rules/qc.smk             | 2 +-
 16 files changed, 13 insertions(+), 7 deletions(-)
 mode change 100644 => 100755 bin/DiffBind_v2_ChIPseq.Rmd
 mode change 100644 => 100755 bin/DiffBind_v2_ChIPseq_block.Rmd
 mode change 100644 => 100755 bin/DiffBind_v2_cfChIP_QC.Rmd
 mode change 100644 => 100755 bin/FRiP_plot.R
 mode change 100644 => 100755 bin/atac_nrf.py
 mode change 100644 => 100755 bin/bam_filter_by_mapq.py
 mode change 100644 => 100755 bin/frip.py
 mode change 100644 => 100755 bin/jaccard_score.py
 mode change 100644 => 100755 bin/ppqt_process.py
 mode change 100644 => 100755 bin/prep_diffbind.py
 mode change 100644 => 100755 bin/prep_diffbindQC.py

diff --git a/bin/DiffBind_v2_ChIPseq.Rmd b/bin/DiffBind_v2_ChIPseq.Rmd
old mode 100644
new mode 100755
diff --git a/bin/DiffBind_v2_ChIPseq_block.Rmd b/bin/DiffBind_v2_ChIPseq_block.Rmd
old mode 100644
new mode 100755
diff --git a/bin/DiffBind_v2_cfChIP_QC.Rmd b/bin/DiffBind_v2_cfChIP_QC.Rmd
old mode 100644
new mode 100755
diff --git a/bin/FRiP_plot.R b/bin/FRiP_plot.R
old mode 100644
new mode 100755
diff --git a/bin/atac_nrf.py b/bin/atac_nrf.py
old mode 100644
new mode 100755
diff --git a/bin/bam_filter_by_mapq.py b/bin/bam_filter_by_mapq.py
old mode 100644
new mode 100755
diff --git a/bin/frip.py b/bin/frip.py
old mode 100644
new mode 100755
diff --git a/bin/jaccard_score.py b/bin/jaccard_score.py
old mode 100644
new mode 100755
diff --git a/bin/ppqt_process.py b/bin/ppqt_process.py
old mode 100644
new mode 100755
diff --git a/bin/prep_diffbind.py b/bin/prep_diffbind.py
old mode 100644
new mode 100755
diff --git a/bin/prep_diffbindQC.py b/bin/prep_diffbindQC.py
old mode 100644
new mode 100755
diff --git a/config/containers.json b/config/containers.json
index 5a296de..5329fa7 100644
--- a/config/containers.json
+++ b/config/containers.json
@@ -1,7 +1,7 @@
 {
     "images": {
-        "cfchip": "docker://skchronicles/cfchip_toolkit_v0.5.0",
-        "python": "docker://asyakhleborodova/chrom_seek_python_v0.1.0",
+        "cfchip": "docker://skchronicles/cfchip_toolkit:v0.5.0",
+        "python": "docker://asyakhleborodova/chrom_seek_python:v0.1.0",
         "ppqt": "docker://asyakhleborodova/ppqt:v0.2.0"
     }
 }
diff --git a/src/run.py b/src/run.py
index 16be94e..019d39c 100644
--- a/src/run.py
+++ b/src/run.py
@@ -207,7 +207,7 @@ def setup(sub_args, ifiles, repo_path, output_path):
     # Add other runtime info for debugging
     config['project']['version'] = __version__
     config['project']['workpath'] = os.path.abspath(sub_args.output)
-    config['project']['binpath'] = os.path.abspath(os.path.join(config['project']['workpath'], '..', 'bin'))
+    config['project']['binpath'] = os.path.abspath(os.path.join(config['project']['workpath'], 'bin'))
     git_hash = git_commit_hash(repo_path)
     config['project']['git_commit_hash'] = git_hash   # Add latest git commit hash
     config['project']['pipeline_path'] = repo_path    # Add path to installation
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 4b9c2c7..23c5ae5 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -122,11 +122,12 @@ elif assay in ["atac", "chip"]:
     if assay == "chip":
         rule_all_ins.extend(expand(join(macsB_dir, "{name}", "{name}_peaks.broadPeak"), name=chips))
         rule_all_ins.extend(expand(join(sicer_dir, "{name}", "{name}_broadpeaks.bed"), name=chips))
+        short_ext, tag_ext = ["sorted", "Q5DD"], ["sorted", "Q5DD_tagAlign"]
         if paired_end:
-            short_ext, tag_ext = ["sorted", "Q5DD"], ["sorted", "Q5DD_tagAlign"]
             rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt"), name=samples, ext=short_ext))
             rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.pdf"), name=samples, ext=short_ext))
             rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt.txt"), name=samples, ext=short_ext))
+        else:
             rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt"), name=samples, ext=tag_ext))
             rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.pdf"), name=samples, ext=tag_ext))
             rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt.txt"), name=samples, ext=tag_ext))
diff --git a/workflow/rules/dba.smk b/workflow/rules/dba.smk
index 42f811e..3c18590 100644
--- a/workflow/rules/dba.smk
+++ b/workflow/rules/dba.smk
@@ -164,10 +164,10 @@ rule UROPA:
         txt                             = join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.txt'),
         bed1                            = temp(join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.bed')),
         bed2                            = temp(join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_finalhits.bed')),
+        json                            = join(uropa_dir, '{PeakTool1}', '{name}.{PeakTool2}.{type}.json'),
     params:
         rname                           = "uropa",
         fldr                            = join(uropa_dir, '{PeakTool1}'),
-        json                            = join(uropa_dir, '{PeakTool1}', '{name}.{PeakTool2}.{type}.json'),
         outroot                         = join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}'),
     threads: 4,
     run:
@@ -233,8 +233,13 @@ rule UROPA:
                     this_q = base_query.copy()
                     this_q['distance'] = _d
                     json_construct['queries'].append(this_q)
-        with open('{params.json}', 'w') as jo:
+
+        with open('{output.json}', 'w') as jo:
             json.dump(json_construct, jo, indent=4)
+            jo.close()
+
+        if not os.path.exists('{output.json}'):
+            raise FileNotFoundError('{output.json} does not exist!')
         shell("uropa -i {params.json} -p {params.outroot} -t {threads} -s")
 
 
diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk
index 48343f9..d07ff71 100644
--- a/workflow/rules/qc.smk
+++ b/workflow/rules/qc.smk
@@ -74,7 +74,7 @@ rule NRF:
         samtoolsver             = config['tools']['SAMTOOLSVER'],
         rver                    = config['tools']['RVER'],
         preseqver               = config['tools']['PRESEQVER'],
-        nrfscript               = join(workpath, "workflow", "scripts", "atac_nrf.py"),
+        nrfscript               = join(bin_path, "atac_nrf.py"),
     threads: 16
     shell: 
         """

From b54ac886c185f5fea2e2a4cb0c78620f6d07293a Mon Sep 17 00:00:00 2001
From: Ryan Routsong <routsongrm@nih.gov>
Date: Mon, 22 Jul 2024 16:11:35 -0400
Subject: [PATCH 16/28] fix: working out bugs discovered on AV

---
 bin/prep_diffbind.py                |  8 ++++----
 bin/prep_diffbindQC.py              |  4 ++--
 src/run.sh                          |  4 ++--
 workflow/Snakefile                  |  2 +-
 workflow/rules/dba.smk              | 28 ++++++++++++++--------------
 workflow/rules/hooks.smk            |  4 ++--
 workflow/rules/qc.smk               | 10 ++++++----
 workflow/rules/trim_align_dedup.smk |  7 +++----
 workflow/scripts/blocking.py        | 16 +++++++++++-----
 workflow/scripts/grouping.py        |  3 ++-
 10 files changed, 47 insertions(+), 39 deletions(-)

diff --git a/bin/prep_diffbind.py b/bin/prep_diffbind.py
index 4ba7d64..fc96cd7 100755
--- a/bin/prep_diffbind.py
+++ b/bin/prep_diffbind.py
@@ -23,20 +23,20 @@
 blocks = config['project']['blocks']
 
 if None in list(blocks.values()):
-    samplesheet = [",".join(["SampleID","Condition", "Replicate", "bamReads", 
+    samplesheet = [",".join(["SampleID", "Condition", "Replicate", "bamReads", 
          "ControlID", "bamControl", "Peaks", "PeakCaller"])]
 else:
-    samplesheet = [",".join(["SampleID","Condition","Treatment","Replicate", "bamReads", 
+    samplesheet = [",".join(["SampleID", "Condition", "Treatment", "Replicate", "bamReads", 
          "ControlID", "bamControl", "Peaks", "PeakCaller"])]
    
 
 for condition in args.group1, args.group2:
     for chip in groupdata[condition]:
         replicate = str([ i + 1 for i in range(len(groupdata[condition])) if groupdata[condition][i]== chip ][0])
-        bamReads = args.workpath + "/" + args.bamdir + "/" + chip + ".Q5DD.bam"
+        bamReads = args.bamdir + "/" + chip + ".Q5DD.bam"
         controlID = chip2input[chip]
         if controlID != "":
-            bamControl = args.workpath + "/" + args.bamdir + "/" +  controlID + ".Q5DD.bam"
+            bamControl = args.bamdir + "/" +  controlID + ".Q5DD.bam"
         else:
             bamControl = ""
         peaks = args.workpath + "/" + args.peaktool + "/" + chip + "/" + chip + args.peakextension
diff --git a/bin/prep_diffbindQC.py b/bin/prep_diffbindQC.py
index 550b5f9..59c19d3 100755
--- a/bin/prep_diffbindQC.py
+++ b/bin/prep_diffbindQC.py
@@ -36,10 +36,10 @@
       condition = ""
       replicate = str(count)
       count = count +1
-   bamReads = args.workpath + "/" + args.bamdir + "/" + chip + ".Q5DD.bam"
+   bamReads = args.bamdir + "/" + chip + ".Q5DD.bam"
    controlID = chip2input[chip]
    if controlID != "":
-      bamControl = args.workpath + "/" + args.bamdir + "/" +  controlID + ".Q5DD.bam"
+      bamControl = args.bamdir + "/" +  controlID + ".Q5DD.bam"
    else:
       bamControl = ""
    peaks = args.workpath + "/" + args.peaktool + "/" + chip + "/" + chip + args.peakextension
diff --git a/src/run.sh b/src/run.sh
index 9cade43..83315ea 100755
--- a/src/run.sh
+++ b/src/run.sh
@@ -209,7 +209,6 @@ function submit(){
           if [[ ${6#\'} != /lscratch* ]]; then
             CLUSTER_OPTS="sbatch --cpus-per-task {cluster.threads} -p {cluster.partition} -t {cluster.time} --mem {cluster.mem} --job-name={params.rname} -e $SLURM_DIR/slurm-%j_{params.rname}.out -o $SLURM_DIR/slurm-%j_{params.rname}.out {cluster.ntasks} {cluster.ntasks_per_core} {cluster.exclusive}"
           fi
-          # Create sbacth script to build index
     cat << EOF > kickoff.sh
 #!/usr/bin/env bash
 #SBATCH --cpus-per-task=16 
@@ -226,7 +225,8 @@ snakemake --latency-wait 120 -s "$3/workflow/Snakefile" -d "$3" \\
   --use-singularity --singularity-args "'-B $4'" \\
   --use-envmodules --configfile="$3/config.json" \\
   --printshellcmds --cluster-config "$3/config/cluster.json" \\
-  --cluster "${CLUSTER_OPTS}" --keep-going --restart-times 3 -j 500 \\
+  --cluster "${CLUSTER_OPTS}" --keep-going -j 500 \\
+  --keep-incomplete --restart-times 1 \\
   --rerun-incomplete --stats "$3/logfiles/runtime_statistics.json" \\
   --keep-remote --local-cores 14 2>&1
 # Create summary report
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 23c5ae5..91a4240 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -33,7 +33,7 @@ groups                          = list(groupdatawinput.keys())
 reps                            = True if len(groupswreps) > 0 else False
 uniq_inputs                     = list(sorted(set([v for v in chip2input.values() if v])))
 sampleswinput                   = [
-    chip_value for input_id, chip_value in chip2input.items() \
+    chip_value for chip_value, input_id in chip2input.items() \
     if chip_value != 'NA' and chip_value != ''
 ]
 inputnorm                       = [""] if len(sampleswinput) == 0 else ["", ".inputnorm"]
diff --git a/workflow/rules/dba.smk b/workflow/rules/dba.smk
index 3c18590..2fbc15b 100644
--- a/workflow/rules/dba.smk
+++ b/workflow/rules/dba.smk
@@ -172,16 +172,15 @@ rule UROPA:
     threads: 4,
     run:
         # Dynamically creates UROPA config file
-        shell(f"module load {uropaver}")
-        if not os.path.exists("{params.fldr}"): 
-            os.mkdir("{params.fldr}", mode=0o775)
+        if not os.path.exists(params.fldr): 
+            os.mkdir(params.fldr, mode=0o775)
 
         json_construct = dict()
         json_construct['queries'] = []
         json_construct['show_attributes'] = ["gene_id", "gene_name", "gene_type"]
         json_construct["priority"] = "Yes"
         json_construct['gtf'] = gtf
-        json_construct['bed'] = "{input}"
+        json_construct['bed'] = input[0]
 
         base_query = {
             "feature": "gene",
@@ -191,19 +190,19 @@ rule UROPA:
         }
 
         if assay == 'cfchip':
-            if '{type}' == 'protTSS':
+            if wildcards.type == 'protTSS':
                 for _d in (3000, 10000, 100000):
                     this_q = base_query.copy()
                     this_q['distance'] = _d
                     json_construct['queries'].append(this_q)
         else:
-            if '{type}' == 'prot':
+            if wildcards.type == 'prot':
                 for _d in (5000, 100000):
                     this_q = base_query.copy()
                     del this_q["feature.anchor"]
                     this_q['distance'] = _d
                     json_construct['queries'].append(this_q)
-            elif '{type}' == 'genes':
+            elif wildcards.type == 'genes':
                 this_query = {}
                 this_query['feature'] = 'gene'
                 for _d in (5000, 100000):
@@ -213,7 +212,7 @@ rule UROPA:
                     del this_q["attribute.value"]
                     this_q['distance'] = _d
                     json_construct['queries'].append(this_q)
-            elif '{type}' == 'protSEC':
+            elif wildcards.type == 'protSEC':
                 # distance, feature.anchor
                 query_values = (
                     ([3000, 1000], "start"), 
@@ -226,21 +225,22 @@ rule UROPA:
                     del this_q["feature.anchor"]
                     if feature_anchor: 
                         this_q["feature.anchor"] = feature_anchor
-                    this_q['distance'] = _d
+                    this_q['distance'] = _distance
                     json_construct['queries'].append(this_q)
-            elif '{type}' == 'protTSS':
+            elif wildcards.type == 'protTSS':
                 for _d in ([3000, 1000], 10000, 100000):
                     this_q = base_query.copy()
                     this_q['distance'] = _d
                     json_construct['queries'].append(this_q)
 
-        with open('{output.json}', 'w') as jo:
+        with open(output.json, 'w') as jo:
             json.dump(json_construct, jo, indent=4)
             jo.close()
 
-        if not os.path.exists('{output.json}'):
-            raise FileNotFoundError('{output.json} does not exist!')
-        shell("uropa -i {params.json} -p {params.outroot} -t {threads} -s")
+        if not os.path.exists(output.json):
+            raise FileNotFoundError(output.json + " does not exist!")
+        shell.prefix(f"module load {uropaver};")
+        shell("uropa -i " + output.json + " -p " + params.outroot + " -t " + str(threads) + " -s")
 
 
 rule manorm:
diff --git a/workflow/rules/hooks.smk b/workflow/rules/hooks.smk
index dfa4f47..1c8666f 100644
--- a/workflow/rules/hooks.smk
+++ b/workflow/rules/hooks.smk
@@ -31,7 +31,7 @@ if config['options']['mode'] == 'slurm':
             # previously submitted jobs 
             sleep 15; rm -f COMPLETED FAILED RUNNING;
             timestamp=$(date +"%Y-%m-%d_%H-%M-%S");
-            ./workflow/scripts/jobby \\
+            ./bin/jobby \\
                 $(grep --color=never "^Submitted .* external jobid" logfiles/snakemake.log \\
                     | awk '{{print $NF}}' \\
                     | sed "s/['.]//g" \\
@@ -68,7 +68,7 @@ if config['options']['mode'] == 'slurm':
             # previously submitted jobs 
             sleep 15; rm -f COMPLETED FAILED RUNNING;
             timestamp=$(date +"%Y-%m-%d_%H-%M-%S");
-            ./workflow/scripts/jobby \\
+            ./bin/jobby \\
                 $(grep --color=never "^Submitted .* external jobid" logfiles/snakemake.log \\
                     | awk '{{print $NF}}' \\
                     | sed "s/['.]//g" \\
diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk
index d07ff71..d97f6ea 100644
--- a/workflow/rules/qc.smk
+++ b/workflow/rules/qc.smk
@@ -370,8 +370,7 @@ rule insert_size:
 
 rule deeptools_QC:
     input:
-        # this should be all bigwigs
-        [ join(workpath, bw_dir, name + ".Q5DD.RPGC.bw") for name in samples ] 
+        [ join(bw_dir, name + ".Q5DD.RPGC.bw") for name in samples ] 
     output:
         javaram                 = '16g',
         heatmap                 = join(deeptools_dir, "spearman_heatmap.Q5DD.pdf"),
@@ -380,13 +379,16 @@ rule deeptools_QC:
 	    png                     = join(deeptools_dir, "spearman_heatmap.Q5DD_mqc.png")
     params:
         rname                   = "deeptools_QC",
+        parent_dir              = deeptools_dir,
         deeptoolsver            = config['tools']['DEEPTOOLSVER'],
         # this should be the sample names to match the bigwigs in the same order
-        labels                  = samples 
+        labels                  = samples
+    threads: 24
     shell: 
         """    
         module load {params.deeptoolsver}
-        multiBigwigSummary bins -b {input} -l {params.labels} -out {output.npz}
+        if [ ! -d "{params.parent_dir}" ]; then mkdir "{params.parent_dir}"; fi
+        multiBigwigSummary bins -b {input} -p {threads} -l {params.labels} -out {output.npz}
         plotCorrelation -in {output.npz} -o {output.heatmap} -c 'spearman' -p 'heatmap' --skipZeros --removeOutliers
         plotCorrelation -in {output.npz} -o {output.png} -c 'spearman' -p 'heatmap' --skipZeros --removeOutliers
         plotPCA -in {output.npz} -o {output.pca}
diff --git a/workflow/rules/trim_align_dedup.smk b/workflow/rules/trim_align_dedup.smk
index 5665956..fa6cf00 100644
--- a/workflow/rules/trim_align_dedup.smk
+++ b/workflow/rules/trim_align_dedup.smk
@@ -418,20 +418,19 @@ rule inputnorm:
        bigWig file of treatmment sample normalizes with its input control
     """
     input:
-        bws                                 = lambda w: ctrl_test(chip2input, w.name, bw_dir)
+        chip                                = lambda wc: ctrl_test(chip2input, wc.name, bw_dir, 'chip'),
+        ctrl                                = lambda wc: ctrl_test(chip2input, wc.name, bw_dir, 'ctrl')
     output:
         join(bw_dir, "{name}.Q5DD.RPGC.inputnorm.bw")
     params:
         rname                               = "inputnorm",
-        bigwig_declare                      = lambda w, input: f"--bigwig1 {input.bws[0]}" if len(input.bws) == 1 \
-                                                else f"--bigwig1 {input.bws[0]} --bigwig2 {input.bws[1]}"
+        bigwig_declare                      = lambda wc, input: f"--bigwig1 {input.chip} --bigwig2 {input.ctrl}",
     threads: 
         int(allocated("threads", "inputnorm", cluster)),
     envmodules: 
         config['tools']['DEEPTOOLSVER'],
     shell: 
         """
-        echo {input}
         bigwigCompare \\
             --binSize 25 \\
             --outFileName {output} \\
diff --git a/workflow/scripts/blocking.py b/workflow/scripts/blocking.py
index 9f3febb..270376b 100644
--- a/workflow/scripts/blocking.py
+++ b/workflow/scripts/blocking.py
@@ -1,10 +1,10 @@
 #!/usr/bin/env python3
+import os
 from os.path import join
+from collections import defaultdict
 
 
 # ~~~ Common helper functions for blocking or controls
-
-
 def test_for_block(groupdata, contrast, blocks):
     """ only want to run blocking on contrasts where all
     individuals are on both sides of the contrast """
@@ -20,9 +20,15 @@ def test_for_block(groupdata, contrast, blocks):
     return contrastBlock
 
 
-def ctrl_test(ctrl_dict, input_name, in_dir):
+def ctrl_test(ctrl_dict, input_name, in_dir, mode=None):
     sample = join(in_dir, f"{input_name}.Q5DD.RPGC.bw")
+    assert mode in ('chip', 'ctrl'), 'Unrecognized input file mode.'
+    # assert os.path.exists(sample), f'{sample} sample does not exist!'
+    
     if input_name in ctrl_dict:
         norm = join(in_dir, ctrl_dict[input_name] + ".Q5DD.RPGC.bw")
-        return [sample, norm]
-    return [sample]
\ No newline at end of file
+        # assert os.path.exists(norm), f'{norm} control does not exist!'
+    else:
+        raise ValueError(f'ChIP sample {input_name} missing from input lookup: \n{str(ctrl_dict)}')
+    outs = {'chip': sample, 'ctrl': norm}
+    return outs[mode]
\ No newline at end of file
diff --git a/workflow/scripts/grouping.py b/workflow/scripts/grouping.py
index b8214cb..09c9fd4 100644
--- a/workflow/scripts/grouping.py
+++ b/workflow/scripts/grouping.py
@@ -106,4 +106,5 @@ def get_bam_input(bam_dir, wildcards, paired_end):
             bams.append(join(bam_dir, "{0}.Q5DD.bam".format(wildcards.name)))
         elif wildcards.ext == "sorted":
             bams.append(join(bam_dir, "{0}.sorted.bam".format(wildcards.name)))
-    return bams
\ No newline at end of file
+    return bams
+

From 2f6aff81654684a00a78523db22b19f396e4ba19 Mon Sep 17 00:00:00 2001
From: Ryan Routsong <routsongrm@nih.gov>
Date: Mon, 29 Jul 2024 12:42:36 -0400
Subject: [PATCH 17/28] fix: testing corrections

---
 workflow/Snakefile                  | 111 +++++++++++++++-------------
 workflow/chrom-seek.code-workspace  |  13 ++++
 workflow/rules/peakcall.smk         |   2 +-
 workflow/rules/qc.smk               |   1 -
 workflow/rules/trim_align_dedup.smk |   1 +
 5 files changed, 74 insertions(+), 54 deletions(-)
 create mode 100644 workflow/chrom-seek.code-workspace

diff --git a/workflow/Snakefile b/workflow/Snakefile
index 91a4240..b2c7c42 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -21,6 +21,7 @@ paired_end                      = False if config['project']['nends'] == 1 else
 chips                           = config['project']['peaks']['chips']
 contrast                        = config['project']['contrast']
 chip2input                      = config['project']['peaks']['inputs']
+has_inputs                      = False if set(chip2input.values()) in ({''}, {None}) else True
 groupdata                       = config['project']['groups']
 peak_types                      = config['options']['peak_type_base']
 rule_all_ins                    = []
@@ -100,28 +101,32 @@ if assay == "cfchip":
             name=contrasts,
              _type=["protTSS"]
         ))
-    # else:
-    #     rule_all_ins.extend(expand(
-    #         join(uropa_dir, '{PeakTool}', '{name}_{PeakTool}_uropa_{_type}_allhits.txt'), 
-    #         PeakTool="MANorm", 
-    #         name=contrasts, 
-    #         _type=UropaCats
-    #     ))
-    #     rule_all_ins.extend(expand(
-    #         join(manorm_dir, "{group1}_vs_{group2}-{tool}", "{group1}_vs_{group2}-{tool}_all_MAvalues.xls"), 
-    #         group1=zipGroup1, 
-    #         group2=zipGroup2, 
-    #         tool=zipToolC
-    #     ))
+    else:
+        rule_all_ins.extend(expand(
+            join(uropa_dir, '{PeakTool}', '{name}_{PeakTool}_uropa_{_type}_allhits.txt'), 
+            PeakTool="MANorm", 
+            name=contrasts, 
+            _type=UropaCats
+        ))
+        rule_all_ins.extend(expand(
+            join(manorm_dir, "{group1}_vs_{group2}-{tool}", "{group1}_vs_{group2}-{tool}_all_MAvalues.xls"), 
+            group1=zipGroup1, 
+            group2=zipGroup2, 
+            tool=zipToolC
+        ))
 elif assay in ["atac", "chip"]:
     peak_types.extend(["prot", "protSEC", "genes"])
-    rule_all_ins.extend(expand(join(MEME_dir, "{PeakTool}", "{name}_meme", "meme-chip.html"), PeakTool=PeakTools, name=chips))
-    rule_all_ins.extend(expand(join(MEME_dir, "{PeakTool}", "{name}_ame", "ame.html"), PeakTool=PeakTools, name=chips))
+    # meme outputs turned off for now
+    # if has_inputs:
+    #     rule_all_ins.extend(expand(join(MEME_dir, "{PeakTool}", "{name}_meme", "meme-chip.html"), PeakTool=PeakTools, name=chips))
+    #     rule_all_ins.extend(expand(join(MEME_dir, "{PeakTool}", "{name}_ame", "ame.html"), PeakTool=PeakTools, name=chips))
     if paired_end:
         rule_all_ins.extend(expand(join(qc_dir, "{name}.{stem}.insert_size_metrics.txt"), name=samples, stem=file_stems))
     if assay == "chip":
         rule_all_ins.extend(expand(join(macsB_dir, "{name}", "{name}_peaks.broadPeak"), name=chips))
-        rule_all_ins.extend(expand(join(sicer_dir, "{name}", "{name}_broadpeaks.bed"), name=chips))
+        # sicer outputs turned off for now
+        # if has_inputs:
+        #     rule_all_ins.extend(expand(join(sicer_dir, "{name}", "{name}_broadpeaks.bed"), name=chips))
         short_ext, tag_ext = ["sorted", "Q5DD"], ["sorted", "Q5DD_tagAlign"]
         if paired_end:
             rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt"), name=samples, ext=short_ext))
@@ -136,51 +141,53 @@ elif assay in ["atac", "chip"]:
             join(genrich_dir, "{name}", "{name}.narrowPeak"), name=chips
         ))
     if reps:
-        rule_all_ins.extend(expand(
-            join(uropa_dir, "{PeakTool}", "{name}_{PeakTool}_uropa_{_type}_allhits.txt"),
-            PeakTool=PeakTools, name=chips, _type=peak_types
-        ))
-        rule_all_ins.extend(expand(
-            join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), 
-            group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC
-        ))
-        rule_all_ins.extend(expand(
-            join(uropa_dir, "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), 
-            PeakTool=["DiffbindEdgeR", "DiffbindDeseq2"], 
-            name=contrasts, 
-            _type=peak_types
-        ))
+        if has_inputs:
+            rule_all_ins.extend(expand(
+                join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), 
+                group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC
+            ))
+            rule_all_ins.extend(expand(
+                join(uropa_dir, "{PeakTool}", "{name}_{PeakTool}_uropa_{_type}_allhits.txt"),
+                PeakTool=PeakTools, name=chips, _type=peak_types
+            ))
+            rule_all_ins.extend(expand(
+                join(uropa_dir, "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), 
+                PeakTool=["DiffbindEdgeR", "DiffbindDeseq2"], 
+                name=contrasts, 
+                _type=peak_types
+            ))
         if contrast:
             rule_all_ins.extend(expand(
                 join(uropa_dir, "promoterTable2", "DiffbindDeseq2_{PeakTool}_promoter_overlap_summaryTable.txt"), 
                 PeakTool=PeakTools
             ))
-    # else:
-    #     rule_all_ins.extend(expand(
-    #         join(uropa_dir, '{PeakTool}', '{name}_{PeakTool}_uropa_{_type}_allhits.txt'), 
-    #         PeakTool="MANorm", 
-    #         name=contrasts, 
-    #         _type=UropaCats
-    #     ))
-    #     rule_all_ins.extend(expand(
-    #         join(manorm_dir, "{group1}_vs_{group2}-{tool}", "{group1}_vs_{group2}-{tool}_all_MAvalues.xls"), 
-    #         group1=zipGroup1, 
-    #         group2=zipGroup2, 
-    #         tool=zipToolC
-    #     ))
-
-rule_all_ins.append(join(workpath,"multiqc_report.html"))
+    else:
+        rule_all_ins.extend(expand(
+            join(uropa_dir, '{PeakTool}', '{name}_{PeakTool}_uropa_{_type}_allhits.txt'), 
+            PeakTool="MANorm", 
+            name=contrasts, 
+            _type=UropaCats
+        ))
+        rule_all_ins.extend(expand(
+            join(manorm_dir, "{group1}_vs_{group2}-{tool}", "{group1}_vs_{group2}-{tool}_all_MAvalues.xls"), 
+            group1=zipGroup1, 
+            group2=zipGroup2, 
+            tool=zipToolC
+        ))
+rule_all_ins.append(join(workpath, "multiqc_report.html"))
 rule_all_ins.extend(expand(join(qc_dir, "{name}.preseq.dat"), name=samples))
-rule_all_ins.extend(
-    expand(join(qc_dir, "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBindQC_TMMcounts.bed"), PeakTool=PeakTools)
-)
+if has_inputs:
+    rule_all_ins.extend(
+        expand(join(qc_dir, "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBindQC_TMMcounts.bed"), PeakTool=PeakTools)
+    )
+    rule_all_ins.extend(
+        expand(join(peakqc_dir, "{PeakTool}.{name}.Q5DD.FRiP_table.txt"), PeakTool=PeakTools, name=samples)
+    )
 rule_all_ins.extend(expand(join(bam_dir, "{name}.{ext}"), name=samples, ext=extaln))
 rule_all_ins.extend(expand(join(macsN_dir, "{name}","{name}_peaks.narrowPeak"), name=chips))
-rule_all_ins.extend(
-    expand(join(peakqc_dir, "{PeakTool}.{name}.Q5DD.FRiP_table.txt"), PeakTool=PeakTools, name=samples)
-)
 rule_all_ins.extend(expand(join(bw_dir, "{name}.{ext}.RPGC.bw"), name=samples, ext=["sorted", "Q5DD"]))
-rule_all_ins.extend(expand(join(bw_dir, "{name}.Q5DD.RPGC.inputnorm.bw"), name=sampleswinput))
+if has_inputs:
+    rule_all_ins.extend(expand(join(bw_dir, "{name}.Q5DD.RPGC.inputnorm.bw"), name=sampleswinput))
 
 rule all:
     input:
diff --git a/workflow/chrom-seek.code-workspace b/workflow/chrom-seek.code-workspace
new file mode 100644
index 0000000..ba2accd
--- /dev/null
+++ b/workflow/chrom-seek.code-workspace
@@ -0,0 +1,13 @@
+{
+	"folders": [
+		{
+			"path": ".."
+		},
+		{
+			"path": "../../../../../../data/OpenOmics/dev/datasets"
+		}
+	],
+	"settings": {
+		"r.lsp.promptToInstall": false
+	}
+}
\ No newline at end of file
diff --git a/workflow/rules/peakcall.smk b/workflow/rules/peakcall.smk
index e81f5a4..fd8a520 100644
--- a/workflow/rules/peakcall.smk
+++ b/workflow/rules/peakcall.smk
@@ -173,7 +173,7 @@ rule SICER:
         fragLen                         = lambda w: getSicerFragLen(ppqt_dir, qc_dir, w.name, paired_end),
         c_option                        = lambda w: get_control_input(chip2input[w.name], paired_end, bam_dir),
     output:
-        bed                             = join(sicer_dir, "{name}", "{name}_broadpeaks.bed"),
+        bed                             = join(sicer_dir, "{name}", "{name}_broadpeaks.bed") if has_inputs else [],
     params:
         rname                           = 'SICER',
         name                            = "{name}",
diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk
index d97f6ea..61b02ea 100644
--- a/workflow/rules/qc.smk
+++ b/workflow/rules/qc.smk
@@ -372,7 +372,6 @@ rule deeptools_QC:
     input:
         [ join(bw_dir, name + ".Q5DD.RPGC.bw") for name in samples ] 
     output:
-        javaram                 = '16g',
         heatmap                 = join(deeptools_dir, "spearman_heatmap.Q5DD.pdf"),
         pca                     = join(deeptools_dir, "pca.Q5DD.pdf"),
 	    npz                     = temp(join(deeptools_dir, "Q5DD.npz")),
diff --git a/workflow/rules/trim_align_dedup.smk b/workflow/rules/trim_align_dedup.smk
index fa6cf00..0198ea5 100644
--- a/workflow/rules/trim_align_dedup.smk
+++ b/workflow/rules/trim_align_dedup.smk
@@ -13,6 +13,7 @@ genome                          = config['options']['genome']
 paired_end                      = False if config['project']['nends'] == 1 else True
 ends                            = [1] if not paired_end else [1, 2]
 chip2input                      = config['project']['peaks']['inputs']
+has_inputs                      = False if set(chip2input.values()) == {''} else True
 
 # ~~ directories
 trim_dir                        = join(workpath, 'trim')

From 710cae0ea2a78ddd631dbeb87d2feb8086be13a0 Mon Sep 17 00:00:00 2001
From: Ryan Routsong <routsongrm@nih.gov>
Date: Wed, 31 Jul 2024 14:33:23 -0400
Subject: [PATCH 18/28] fix: comment out manorm rules for now

---
 workflow/Snakefile                  | 26 ++++++++++++++------------
 workflow/rules/dba.smk              |  5 +++--
 workflow/rules/trim_align_dedup.smk |  3 ++-
 workflow/scripts/peakcall.py        |  2 ++
 4 files changed, 21 insertions(+), 15 deletions(-)

diff --git a/workflow/Snakefile b/workflow/Snakefile
index b2c7c42..eaf2aef 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -108,12 +108,13 @@ if assay == "cfchip":
             name=contrasts, 
             _type=UropaCats
         ))
-        rule_all_ins.extend(expand(
-            join(manorm_dir, "{group1}_vs_{group2}-{tool}", "{group1}_vs_{group2}-{tool}_all_MAvalues.xls"), 
-            group1=zipGroup1, 
-            group2=zipGroup2, 
-            tool=zipToolC
-        ))
+        # manorm outputs turned off for now
+        # rule_all_ins.extend(expand(
+        #     join(manorm_dir, "{group1}_vs_{group2}-{tool}", "{group1}_vs_{group2}-{tool}_all_MAvalues.xls"), 
+        #     group1=zipGroup1, 
+        #     group2=zipGroup2, 
+        #     tool=zipToolC
+        # ))
 elif assay in ["atac", "chip"]:
     peak_types.extend(["prot", "protSEC", "genes"])
     # meme outputs turned off for now
@@ -168,12 +169,13 @@ elif assay in ["atac", "chip"]:
             name=contrasts, 
             _type=UropaCats
         ))
-        rule_all_ins.extend(expand(
-            join(manorm_dir, "{group1}_vs_{group2}-{tool}", "{group1}_vs_{group2}-{tool}_all_MAvalues.xls"), 
-            group1=zipGroup1, 
-            group2=zipGroup2, 
-            tool=zipToolC
-        ))
+        # manorm outputs turned off for now
+        # rule_all_ins.extend(expand(
+        #     join(manorm_dir, "{group1}_vs_{group2}-{tool}", "{group1}_vs_{group2}-{tool}_all_MAvalues.xls"), 
+        #     group1=zipGroup1, 
+        #     group2=zipGroup2, 
+        #     tool=zipToolC
+        # ))
 rule_all_ins.append(join(workpath, "multiqc_report.html"))
 rule_all_ins.extend(expand(join(qc_dir, "{name}.preseq.dat"), name=samples))
 if has_inputs:
diff --git a/workflow/rules/dba.smk b/workflow/rules/dba.smk
index 2fbc15b..e85fc6e 100644
--- a/workflow/rules/dba.smk
+++ b/workflow/rules/dba.smk
@@ -26,6 +26,7 @@ diffbind_dir                    = join(workpath, "DiffBind")
 uropa_dir                       = join(workpath, "UROPA_annotations")
 uropa_diffbind_dir              = join(uropa_dir, "DiffBind")
 bam_dir                         = join(workpath, "bam")
+ppqt_dir                        = join(bam_dir, "ppqt")
 qc_dir                          = join(workpath, "PeakQC")
 idr_dir                         = join(workpath, "IDR")
 memechip_dir                    = join(workpath, "MEME")
@@ -247,7 +248,7 @@ rule manorm:
     input:
         bam1                            = lambda w: join(bam_dir, groupdata[w.group1][0] + ".Q5DD.bam"),
         bam2                            = lambda w: join(bam_dir, groupdata[w.group2][0] + ".Q5DD.bam"),
-        ppqt                            = join(ppqt_dir, "Q5DD.ppqt.txt"),
+        # ppqt                            = join(ppqt_dir, "Q5DD.ppqt.txt"), # ppqt input into manorm TODO
         peak1                           = lambda w: join(workpath, w.tool, groupdata[w.group1][0], groupdata[w.group1][0] + PeakExtensions[w.tool]),
         peak2                           = lambda w: join(workpath, w.tool, groupdata[w.group2][0], groupdata[w.group2][0] + PeakExtensions[w.tool]),
     output:
@@ -261,7 +262,7 @@ rule manorm:
         fldr                            = join(manorm_dir, "{group1}_vs_{group2}-{tool}"),
         bedtoolsver                     = config['tools']['BEDTOOLSVER'],
         manormver                       = "manorm/1.1.4",
-        extsizes                        = lambda w, _in: get_manorm_sizes(w.group1, w.group2, groupdata, _in.ppqt)
+        extsizes                        = lambda w, input: get_manorm_sizes(w.group1, w.group2, groupdata, "")
     shell:
         """
         if [ ! -e /lscratch/$SLURM_JOBID ]; then 
diff --git a/workflow/rules/trim_align_dedup.smk b/workflow/rules/trim_align_dedup.smk
index 0198ea5..0b8cc18 100644
--- a/workflow/rules/trim_align_dedup.smk
+++ b/workflow/rules/trim_align_dedup.smk
@@ -9,6 +9,7 @@ from scripts.blocking import ctrl_test
 
 # ~~ workflow configuration
 workpath                        = config['project']['workpath']
+bin_path                        = config['project']['binpath']
 genome                          = config['options']['genome']
 paired_end                      = False if config['project']['nends'] == 1 else True
 ends                            = [1] if not paired_end else [1, 2]
@@ -323,7 +324,7 @@ rule ppqt:
         rname                               = "ppqt",
         samtoolsver                         = config['tools']['SAMTOOLSVER'],
         rver                                = config['tools']['RVER'],
-        scriptPy                            = join(workpath, "bin", "ppqt_process.py"),
+        scriptPy                            = join(bin_path, "ppqt_process.py"),
         file_name                           = "{name}"
     container: 
         config['images']['ppqt']
diff --git a/workflow/scripts/peakcall.py b/workflow/scripts/peakcall.py
index fb03a3c..0d86cff 100644
--- a/workflow/scripts/peakcall.py
+++ b/workflow/scripts/peakcall.py
@@ -106,6 +106,8 @@ def getSicerFragLen(ppqt_dir, qc_dir, name, paired_end):
 
 
 def get_manorm_sizes(g1, g2, group_data, ppqt_in):
+    if not ppqt_in:
+        return ""
     file = lambda w, _in: list(map(lambda z: z.strip().split(), open(ppqt_in, 'r').readlines()))
     extsize1 = [ppqt[1] for ppqt in file if ppqt[0] == group_data[g1]][0]
     extsize2 = [ppqt[1] for ppqt in file if ppqt[0] == group_data[g2]][0]

From 53a7949eef583a3c1552df2718cb35bcde6aafeb Mon Sep 17 00:00:00 2001
From: Ryan Routsong <routsongrm@nih.gov>
Date: Wed, 31 Jul 2024 14:37:50 -0400
Subject: [PATCH 19/28] fix: turn off sicer involved inputs in cfchip pipeline

---
 workflow/scripts/grouping.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/workflow/scripts/grouping.py b/workflow/scripts/grouping.py
index 09c9fd4..8d78899 100644
--- a/workflow/scripts/grouping.py
+++ b/workflow/scripts/grouping.py
@@ -70,7 +70,9 @@ def get_peaktools(assay_type):
     if assay_type == "atac": 
         tools.append("Genrich") 
     elif assay_type == "chip":
-        tools.extend(["macsBroad", "sicer"])
+        tools.extend(["macsBroad"])
+        # turn sicer off for now
+        # tools.extend(["macsBroad", "sicer"])
     return tools
 
 

From 1779a9175dc30197285a039844a57b2deda005c1 Mon Sep 17 00:00:00 2001
From: Ryan Routsong <routsongrm@nih.gov>
Date: Wed, 31 Jul 2024 15:47:27 -0400
Subject: [PATCH 20/28] fix: realign uropa, promotertable2, diffbind outputs
 inputs

---
 workflow/Snakefile        | 44 +++++++++++++++++++--------------------
 workflow/rules/cfChIP.smk |  3 ++-
 workflow/rules/dba.smk    | 15 +++++++------
 3 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/workflow/Snakefile b/workflow/Snakefile
index eaf2aef..6f16496 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -101,6 +101,11 @@ if assay == "cfchip":
             name=contrasts,
              _type=["protTSS"]
         ))
+        if contrast:
+            rule_all_ins.extend(expand(
+                join(uropa_dir, "promoterTable2", "DiffbindDeseq2_{PeakTool}_promoter_overlap_summaryTable.txt"), 
+                PeakTool=PeakTools
+            ))
     else:
         rule_all_ins.extend(expand(
             join(uropa_dir, '{PeakTool}', '{name}_{PeakTool}_uropa_{_type}_allhits.txt'), 
@@ -108,13 +113,13 @@ if assay == "cfchip":
             name=contrasts, 
             _type=UropaCats
         ))
-        # manorm commented turned off now
-        # rule_all_ins.extend(expand(
-        #     join(manorm_dir, "{group1}_vs_{group2}-{tool}", "{group1}_vs_{group2}-{tool}_all_MAvalues.xls"), 
-        #     group1=zipGroup1, 
-        #     group2=zipGroup2, 
-        #     tool=zipToolC
-        # ))
+        rule_all_ins.extend(expand(
+            join(manorm_dir, "{group1}_vs_{group2}-{tool}", "{group1}_vs_{group2}-{tool}_all_MAvalues.xls"), 
+            group1=zipGroup1, 
+            group2=zipGroup2, 
+            tool=zipToolC
+        ))
+
 elif assay in ["atac", "chip"]:
     peak_types.extend(["prot", "protSEC", "genes"])
     # meme outputs turned off for now
@@ -151,16 +156,12 @@ elif assay in ["atac", "chip"]:
                 join(uropa_dir, "{PeakTool}", "{name}_{PeakTool}_uropa_{_type}_allhits.txt"),
                 PeakTool=PeakTools, name=chips, _type=peak_types
             ))
-            rule_all_ins.extend(expand(
-                join(uropa_dir, "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), 
-                PeakTool=["DiffbindEdgeR", "DiffbindDeseq2"], 
-                name=contrasts, 
-                _type=peak_types
-            ))
         if contrast:
             rule_all_ins.extend(expand(
-                join(uropa_dir, "promoterTable2", "DiffbindDeseq2_{PeakTool}_promoter_overlap_summaryTable.txt"), 
-                PeakTool=PeakTools
+                join(uropa_diffbind_dir, "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), 
+                PeakTool=["DiffbindEdgeR", "DiffbindDeseq2"], 
+                name=contrasts,
+                _type=["protTSS", "prot", "protSEC", "genes"],
             ))
     else:
         rule_all_ins.extend(expand(
@@ -169,13 +170,12 @@ elif assay in ["atac", "chip"]:
             name=contrasts, 
             _type=UropaCats
         ))
-        # manorm commented turned off now
-        # rule_all_ins.extend(expand(
-        #     join(manorm_dir, "{group1}_vs_{group2}-{tool}", "{group1}_vs_{group2}-{tool}_all_MAvalues.xls"), 
-        #     group1=zipGroup1, 
-        #     group2=zipGroup2, 
-        #     tool=zipToolC
-        # ))
+        rule_all_ins.extend(expand(
+            join(manorm_dir, "{group1}_vs_{group2}-{tool}", "{group1}_vs_{group2}-{tool}_all_MAvalues.xls"), 
+            group1=zipGroup1, 
+            group2=zipGroup2, 
+            tool=zipToolC
+        ))
 rule_all_ins.append(join(workpath, "multiqc_report.html"))
 rule_all_ins.extend(expand(join(qc_dir, "{name}.preseq.dat"), name=samples))
 if has_inputs:
diff --git a/workflow/rules/cfChIP.smk b/workflow/rules/cfChIP.smk
index 665ef2f..334508a 100644
--- a/workflow/rules/cfChIP.smk
+++ b/workflow/rules/cfChIP.smk
@@ -16,6 +16,7 @@ bam_dir                         = join(workpath, "bam")
 cfTool_dir                      = join(workpath, "cfChIPtool")
 cfTool_subdir2                  = join(cfTool_dir, "BED", "H3K4me3")
 qc_dir                          = join(workpath, "QC")
+diffbind_dir                    = join(workpath, "DiffBind")
 
 
 rule cfChIPtool:
@@ -87,7 +88,7 @@ rule promoterTable2:
     input:
         expand(join(diffbind_dir, '{name}_DiffbindDeseq2_uropa_protTSS_allhits.txt'), name=contrasts),
     output:
-        txt                     = join(workpath,uropa_dir,"promoterTable2",'DiffbindDeseq2_{PeakTool}_promoter_overlap_summaryTable.txt'),
+        txt                     = join(uropa_dir, "promoterTable2", 'DiffbindDeseq2_{PeakTool}_promoter_overlap_summaryTable.txt'),
     params:
         rname                   = "promoter2",
         script1                 = join(bin_path, "promoterAnnotation_by_Gene.R"),
diff --git a/workflow/rules/dba.smk b/workflow/rules/dba.smk
index e85fc6e..e1865ac 100644
--- a/workflow/rules/dba.smk
+++ b/workflow/rules/dba.smk
@@ -31,7 +31,6 @@ qc_dir                          = join(workpath, "PeakQC")
 idr_dir                         = join(workpath, "IDR")
 memechip_dir                    = join(workpath, "MEME")
 homer_dir                       = join(workpath, "HOMER_motifs")
-uropa_dir                       = join(workpath, "UROPA_annotations")
 manorm_dir                      = join(workpath, "MANorm")
 downstream_dir                  = join(workpath, "Downstream")
 otherDirs                       = [qc_dir, homer_dir, uropa_dir]
@@ -96,13 +95,13 @@ rule diffbind:
         lambda w: [ join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) for chip in chips ]
     output:
         html                            = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"),
-        Deseq2                          = join(diffbind_dir, "DiffbindDeseq2", "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2.bed"),
-        EdgeR                           = join(diffbind_dir, "DiffbindEdgeR", "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR.bed"),
-        EdgeR_txt                       = join(diffbind_dir, "DiffbindEdgeR", "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR.txt"),
-        Deseq2_txt                      = join(diffbind_dir, "DiffbindDeseq2", "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2.txt"),
-        EdgeR_ftxt                      = join(diffbind_dir, "DiffbindEdgeR", "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR_fullList.txt"),
+        Deseq2                          = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2.bed"),
+        EdgeR                           = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR.bed"),
+        EdgeR_txt                       = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR.txt"),
+        Deseq2_txt                      = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2.txt"),
+        EdgeR_ftxt                      = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR_fullList.txt"),
         Deseq2_ftxt                     = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2_fullList.txt"),
-        html_block                      = provided(join(diffbind_dir_block, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_blocking.html"), blocking)
+        html_block                      = provided(join(uropa_diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_blocking.html"), blocking)
     params:
         # variables and wildcards used in the shell directive
         rname                           = "diffbind",
@@ -160,7 +159,7 @@ rule diffbind:
 
 rule UROPA:
     input:
-        lambda w: [ join(workpath, w.PeakTool1, w.name, w.name + PeakExtensions[w.PeakTool2]) ],
+        lambda w: join(workpath, w.PeakTool1, w.name, w.name + PeakExtensions[w.PeakTool2]),
     output:
         txt                             = join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.txt'),
         bed1                            = temp(join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.bed')),

From 06fcb3c64a33c70a9018d3391a441332cbcdc7f9 Mon Sep 17 00:00:00 2001
From: Ryan Routsong <routsongrm@nih.gov>
Date: Wed, 31 Jul 2024 18:04:13 -0400
Subject: [PATCH 21/28] fix: realign promotertable cfchip inputs

---
 workflow/Snakefile        | 4 ++--
 workflow/rules/cfChIP.smk | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/workflow/Snakefile b/workflow/Snakefile
index 6f16496..3492aaf 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -96,8 +96,8 @@ if assay == "cfchip":
             group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC
         ))
         rule_all_ins.extend(expand(
-            join(uropa_diffbind_dir, "{PeakTool}", "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), 
-            PeakTool=['DiffbindEdgeR','DiffbindDeseq2'], 
+            join(uropa_diffbind_dir, "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), 
+            PeakTool=['DiffbindEdgeR', 'DiffbindDeseq2'], 
             name=contrasts,
              _type=["protTSS"]
         ))
diff --git a/workflow/rules/cfChIP.smk b/workflow/rules/cfChIP.smk
index 334508a..7465170 100644
--- a/workflow/rules/cfChIP.smk
+++ b/workflow/rules/cfChIP.smk
@@ -86,7 +86,7 @@ rule promoterTable1:
 
 rule promoterTable2:
     input:
-        expand(join(diffbind_dir, '{name}_DiffbindDeseq2_uropa_protTSS_allhits.txt'), name=contrasts),
+        expand(join(uropa_diffbind_dir, '{name}_DiffbindDeseq2_uropa_protTSS_allhits.txt'), name=contrasts),
     output:
         txt                     = join(uropa_dir, "promoterTable2", 'DiffbindDeseq2_{PeakTool}_promoter_overlap_summaryTable.txt'),
     params:

From 5b9ff2c7e762d9fb4fd8797af350117b470954a4 Mon Sep 17 00:00:00 2001
From: Skyler Kuhn <kuhnsa3@gmail.com>
Date: Fri, 2 Aug 2024 10:52:36 -0400
Subject: [PATCH 22/28] Delete workflow/chrom-seek.code-workspace

---
 workflow/chrom-seek.code-workspace | 13 -------------
 1 file changed, 13 deletions(-)
 delete mode 100644 workflow/chrom-seek.code-workspace

diff --git a/workflow/chrom-seek.code-workspace b/workflow/chrom-seek.code-workspace
deleted file mode 100644
index ba2accd..0000000
--- a/workflow/chrom-seek.code-workspace
+++ /dev/null
@@ -1,13 +0,0 @@
-{
-	"folders": [
-		{
-			"path": ".."
-		},
-		{
-			"path": "../../../../../../data/OpenOmics/dev/datasets"
-		}
-	],
-	"settings": {
-		"r.lsp.promptToInstall": false
-	}
-}
\ No newline at end of file

From 991c620756e465c3dd35d1089d710e3c672c431a Mon Sep 17 00:00:00 2001
From: Ryan Routsong <routsongrm@nih.gov>
Date: Fri, 2 Aug 2024 12:22:38 -0400
Subject: [PATCH 23/28] fix: minor review fixes, reverting some dev settings

---
 src/run.sh                          | 3 +--
 workflow/rules/trim_align_dedup.smk | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/run.sh b/src/run.sh
index 83315ea..3c91146 100755
--- a/src/run.sh
+++ b/src/run.sh
@@ -225,8 +225,7 @@ snakemake --latency-wait 120 -s "$3/workflow/Snakefile" -d "$3" \\
   --use-singularity --singularity-args "'-B $4'" \\
   --use-envmodules --configfile="$3/config.json" \\
   --printshellcmds --cluster-config "$3/config/cluster.json" \\
-  --cluster "${CLUSTER_OPTS}" --keep-going -j 500 \\
-  --keep-incomplete --restart-times 1 \\
+  --cluster "${CLUSTER_OPTS}" --keep-going --restart-times 3 -j 500 \\
   --rerun-incomplete --stats "$3/logfiles/runtime_statistics.json" \\
   --keep-remote --local-cores 14 2>&1
 # Create summary report
diff --git a/workflow/rules/trim_align_dedup.smk b/workflow/rules/trim_align_dedup.smk
index 0b8cc18..52ab127 100644
--- a/workflow/rules/trim_align_dedup.smk
+++ b/workflow/rules/trim_align_dedup.smk
@@ -385,7 +385,7 @@ rule bam2bw:
     shell: 
         """
         if [ ! -d \"""" + tmpdir + """\" ]; then mkdir -p \"""" + tmpdir + """\"; fi
-        tmp=$(mktemp -d -p """ + tmpdir + """)
+        tmp=$(mktemp -d -p \"""" + tmpdir + """\")
         trap 'rm -rf "${{tmp}}"' EXIT
 
         bam_cov_option={input.ppqt}

From a453a6b917b4ddbe132f4cc21ed30425fda8313b Mon Sep 17 00:00:00 2001
From: Ryan Routsong <routsongrm@nih.gov>
Date: Fri, 2 Aug 2024 12:59:34 -0400
Subject: [PATCH 24/28] fix: comment out debug-dag from dryrun snakemake
 execution

---
 src/run.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/run.py b/src/run.py
index 019d39c..9b66aab 100644
--- a/src/run.py
+++ b/src/run.py
@@ -611,7 +611,7 @@ def dryrun(outdir, config='config.json', snakefile=os.path.join('workflow', 'Sna
             'snakemake', '-npr',
             '-s', str(snakefile),
             '--verbose',
-            '--debug-dag',
+            # '--debug-dag',
             '--use-singularity',
             '--rerun-incomplete',
             '--cores', str(256),

From 7cae04e25765559684105cb3855786e1ab2c06ff Mon Sep 17 00:00:00 2001
From: Ryan Routsong <routsongrm@nih.gov>
Date: Tue, 6 Aug 2024 13:52:32 -0400
Subject: [PATCH 25/28] fix: reconfigure a few outputs based on test runs

---
 workflow/Snakefile        | 101 ++++++++++++++++++++------------------
 workflow/rules/cfChIP.smk |   4 +-
 workflow/rules/dba.smk    |  11 ++---
 workflow/rules/qc.smk     |   1 +
 4 files changed, 61 insertions(+), 56 deletions(-)

diff --git a/workflow/Snakefile b/workflow/Snakefile
index 3492aaf..0025ec5 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -86,6 +86,10 @@ if assay == "cfchip":
         PeakTool="DiffBindQC", 
         _type=peak_types
     ))
+    if has_inputs:
+        rule_all_ins.extend(
+            expand(join(qc_dir, "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBindQC_TMMcounts.bed"), PeakTool=PeakTools)
+        )
     rule_all_ins.extend(expand(
         join(uropa_dir, "promoterTable1", "{PeakTool}_promoter_overlap_summaryTable.txt"), 
         PeakTool=PeakTools
@@ -95,33 +99,35 @@ if assay == "cfchip":
             join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), 
             group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC
         ))
-        rule_all_ins.extend(expand(
-            join(uropa_diffbind_dir, "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), 
-            PeakTool=['DiffbindEdgeR', 'DiffbindDeseq2'], 
-            name=contrasts,
-             _type=["protTSS"]
-        ))
+
         if contrast:
+            rule_all_ins.extend(expand(
+                join(uropa_diffbind_dir, "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), 
+                PeakTool=['DiffbindEdgeR', 'DiffbindDeseq2'], 
+                name=contrasts,
+                _type=["protTSS"]
+            ))
             rule_all_ins.extend(expand(
                 join(uropa_dir, "promoterTable2", "DiffbindDeseq2_{PeakTool}_promoter_overlap_summaryTable.txt"), 
                 PeakTool=PeakTools
             ))
     else:
-        rule_all_ins.extend(expand(
-            join(uropa_dir, '{PeakTool}', '{name}_{PeakTool}_uropa_{_type}_allhits.txt'), 
-            PeakTool="MANorm", 
-            name=contrasts, 
-            _type=UropaCats
-        ))
-        rule_all_ins.extend(expand(
-            join(manorm_dir, "{group1}_vs_{group2}-{tool}", "{group1}_vs_{group2}-{tool}_all_MAvalues.xls"), 
-            group1=zipGroup1, 
-            group2=zipGroup2, 
-            tool=zipToolC
-        ))
+        pass
+        # remove manorm for now
+        # rule_all_ins.extend(expand(
+        #     join(uropa_dir, '{PeakTool}', '{name}_{PeakTool}_uropa_{_type}_allhits.txt'), 
+        #     PeakTool="MANorm", 
+        #     name=contrasts, 
+        #     _type=UropaCats
+        # ))
+        # rule_all_ins.extend(expand(
+        #     join(manorm_dir, "{group1}_vs_{group2}-{tool}", "{group1}_vs_{group2}-{tool}_all_MAvalues.xls"), 
+        #     group1=zipGroup1, 
+        #     group2=zipGroup2, 
+        #     tool=zipToolC
+        # ))
 
 elif assay in ["atac", "chip"]:
-    peak_types.extend(["prot", "protSEC", "genes"])
     # meme outputs turned off for now
     # if has_inputs:
     #     rule_all_ins.extend(expand(join(MEME_dir, "{PeakTool}", "{name}_meme", "meme-chip.html"), PeakTool=PeakTools, name=chips))
@@ -146,50 +152,49 @@ elif assay in ["atac", "chip"]:
         rule_all_ins.extend(expand(
             join(genrich_dir, "{name}", "{name}.narrowPeak"), name=chips
         ))
+    rule_all_ins.extend(expand(
+        join(uropa_dir, "{PeakTool}", "{name}_{PeakTool}_uropa_{_type}_allhits.txt"),
+        PeakTool=PeakTools, name=chips, _type=peak_types
+    ))
     if reps:
-        if has_inputs:
-            rule_all_ins.extend(expand(
-                join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), 
-                group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC
-            ))
-            rule_all_ins.extend(expand(
-                join(uropa_dir, "{PeakTool}", "{name}_{PeakTool}_uropa_{_type}_allhits.txt"),
-                PeakTool=PeakTools, name=chips, _type=peak_types
-            ))
+        rule_all_ins.extend(expand(
+            join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), 
+            group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC
+        ))
         if contrast:
             rule_all_ins.extend(expand(
                 join(uropa_diffbind_dir, "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), 
                 PeakTool=["DiffbindEdgeR", "DiffbindDeseq2"], 
                 name=contrasts,
-                _type=["protTSS", "prot", "protSEC", "genes"],
+                _type=["protTSS"],
             ))
     else:
-        rule_all_ins.extend(expand(
-            join(uropa_dir, '{PeakTool}', '{name}_{PeakTool}_uropa_{_type}_allhits.txt'), 
-            PeakTool="MANorm", 
-            name=contrasts, 
-            _type=UropaCats
-        ))
-        rule_all_ins.extend(expand(
-            join(manorm_dir, "{group1}_vs_{group2}-{tool}", "{group1}_vs_{group2}-{tool}_all_MAvalues.xls"), 
-            group1=zipGroup1, 
-            group2=zipGroup2, 
-            tool=zipToolC
-        ))
+        pass
+        # manorm turned off now
+        # rule_all_ins.extend(expand(
+        #     join(uropa_dir, '{PeakTool}', '{name}_{PeakTool}_uropa_{_type}_allhits.txt'), 
+        #     PeakTool="MANorm", 
+        #     name=contrasts, 
+        #     _type=UropaCats
+        # ))
+        # rule_all_ins.extend(expand(
+        #     join(manorm_dir, "{group1}_vs_{group2}-{tool}", "{group1}_vs_{group2}-{tool}_all_MAvalues.xls"), 
+        #     group1=zipGroup1, 
+        #     group2=zipGroup2, 
+        #     tool=zipToolC
+        # ))
 rule_all_ins.append(join(workpath, "multiqc_report.html"))
 rule_all_ins.extend(expand(join(qc_dir, "{name}.preseq.dat"), name=samples))
-if has_inputs:
-    rule_all_ins.extend(
-        expand(join(qc_dir, "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBindQC_TMMcounts.bed"), PeakTool=PeakTools)
-    )
-    rule_all_ins.extend(
-        expand(join(peakqc_dir, "{PeakTool}.{name}.Q5DD.FRiP_table.txt"), PeakTool=PeakTools, name=samples)
-    )
+rule_all_ins.extend(
+    expand(join(peakqc_dir, "{PeakTool}.{name}.Q5DD.FRiP_table.txt"), PeakTool=PeakTools, name=samples)
+)
 rule_all_ins.extend(expand(join(bam_dir, "{name}.{ext}"), name=samples, ext=extaln))
 rule_all_ins.extend(expand(join(macsN_dir, "{name}","{name}_peaks.narrowPeak"), name=chips))
 rule_all_ins.extend(expand(join(bw_dir, "{name}.{ext}.RPGC.bw"), name=samples, ext=["sorted", "Q5DD"]))
+
 if has_inputs:
     rule_all_ins.extend(expand(join(bw_dir, "{name}.Q5DD.RPGC.inputnorm.bw"), name=sampleswinput))
+    
 
 rule all:
     input:
diff --git a/workflow/rules/cfChIP.smk b/workflow/rules/cfChIP.smk
index 7465170..7020fba 100644
--- a/workflow/rules/cfChIP.smk
+++ b/workflow/rules/cfChIP.smk
@@ -130,6 +130,6 @@ rule diffbindQC:
             --pc {params.peakcaller} --csv {params.csvfile}
         cp {params.rscript} {params.outdir}
         cd {params.outdir}
-        Rscript -e 'rmarkdown::render("DiffBind_v2_cfChIP_QC.Rmd", output_file= "{output.html}", 
-            params=list(csvfile= "{params.csvfile}", contrasts= "{params.contrast}", peakcaller= "{params.PeakTool}"))'
+        Rscript -e 'rmarkdown::render("DiffBind_v2_cfChIP_QC.Rmd", output_file= "{output.html}",  \
+            params=list(csvfile="{params.csvfile}", contrasts="{params.contrast}", peakcaller="{params.PeakTool}"))'
         """
\ No newline at end of file
diff --git a/workflow/rules/dba.smk b/workflow/rules/dba.smk
index e1865ac..b9aa984 100644
--- a/workflow/rules/dba.smk
+++ b/workflow/rules/dba.smk
@@ -18,8 +18,7 @@ contrast                        = config['project']['contrast']
 uropaver                        = config['tools']['UROPAVER']
 gtf                             = config['references'][genome]['GTFFILE']
 
-# ~~ directories
-bin_path                        = join(workpath, "workflow", "bin")
+# ~~ directories
 diffbind_dir_block              = join(workpath, "DiffBindBlock")
 diffbind_dir2                   = join(workpath, "DiffBind_block")
 diffbind_dir                    = join(workpath, "DiffBind")
@@ -142,15 +141,15 @@ rule diffbind:
             --pc {params.peakcaller} --csv {params.csvfile}
         cp {params.rscript} {params.outdir}
         cd {params.outdir}
-        Rscript -e 'rmarkdown::render("DiffBind_v2_ChIPseq.Rmd", output_file= "{output.html}", 
-        params=list(csvfile= "{params.csvfile}", contrasts= "{params.this_contrast}", peakcaller= "{params.this_peaktool}"))'
+        Rscript -e 'rmarkdown::render("DiffBind_v2_ChIPseq.Rmd", output_file= "{output.html}", \
+            params=list(csvfile="{params.csvfile}", contrasts="{params.this_contrast}", peakcaller="{params.this_peaktool}"))'
         if [ ! -f {output.Deseq2} ]; then touch {output.Deseq2}; fi
         if [ ! -f {output.EdgeR} ]; then touch {output.EdgeR}; fi
 
         if [ '"""+str(blocking)+"""' == True ]; then
             echo "DiffBind with Blocking"
-            Rscript -e 'rmarkdown::render("{params.blocking_rscript}", output_file= "{output.html_block}", 
-            params=list(csvfile= "{params.csvfile}", contrasts= "{params.this_contrast}", peakcaller= "{params.this_peaktool}", dir= "{params.outdir_block}"))'
+            Rscript -e 'rmarkdown::render("{params.blocking_rscript}", output_file= "{output.html_block}", \
+                params=list(csvfile= "{params.csvfile}", contrasts="{params.this_contrast}", peakcaller="{params.this_peaktool}", dir="{params.outdir_block}"))'
             if [ ! -f {params.Deseq2_block} ]; then touch {params.Deseq2_block}; fi
             if [ ! -f {params.EdgeR_block} ]; then touch {params.EdgeR_block}; fi
         fi
diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk
index 61b02ea..82b4376 100644
--- a/workflow/rules/qc.smk
+++ b/workflow/rules/qc.smk
@@ -185,6 +185,7 @@ rule fastqc:
             -exec cp {{}} {params.outdir} \\;
         """
 
+
 rule fastq_screen:
     """
     Quality-control step to screen for different sources of contamination.

From fd490e94bac91cab65f7ce1ac0aee6cc27e65116 Mon Sep 17 00:00:00 2001
From: Ryan Routsong <routsongrm@nih.gov>
Date: Tue, 6 Aug 2024 13:53:03 -0400
Subject: [PATCH 26/28] fix: refactor diffbind prep script, add error case
 exception for umap in diffbindqc rmd

---
 bin/DiffBind_v2_cfChIP_QC.Rmd |  16 +++++-
 bin/prep_diffbindQC.py        | 105 +++++++++++++++++++---------------
 2 files changed, 71 insertions(+), 50 deletions(-)

diff --git a/bin/DiffBind_v2_cfChIP_QC.Rmd b/bin/DiffBind_v2_cfChIP_QC.Rmd
index d058cec..6f6731b 100755
--- a/bin/DiffBind_v2_cfChIP_QC.Rmd
+++ b/bin/DiffBind_v2_cfChIP_QC.Rmd
@@ -146,7 +146,10 @@ try(dba.plotPCA(DBdataCounts),silent=TRUE)
 ```{r TMM}
 vec <- c("seqnames", "start", "end", "width", "strand", samples$samples$SampleID)
 consensus2 <- dba.peakset(DBdataCounts, bRetrieve=TRUE) %>% ##extracts TMM-normalized  counts
-  as.data.frame() %>% setNames(vec) %>% arrange(start, end) %>% mutate(Peaks = paste0("Peak",1:nrow(.))) %>% 
+  as.data.frame() %>% 
+  setNames(vec) %>% 
+  arrange(start, end) %>% 
+  mutate(Peaks = paste0("Peak",1:nrow(.))) %>% 
   dplyr::select(1:4, Peaks, samples$samples$SampleID)
 
 outfile1 <- paste0(contrasts, "-", peakcaller, "_DiffBindQC_TMMcounts.csv")
@@ -164,12 +167,19 @@ counts_TMM_ALL <- counts_TMM_ALL %>% dplyr::select(5:ncol(.)) %>%
   t() %>% log10() %>% as.data.frame(.)
 ##UMAP coordinates
 set.seed(123)
+
 if (nrow(samples$samples) < 16) {
-  umap_coord <- umap(counts_TMM_ALL, n_neighbors= nrow(samples$samples)-1)
+  neighbors=nrow(samples$samples)-1
+  if (neighbors > 1) {
+    umap_coord <- umap(counts_TMM_ALL, n_neighbors=neighbors)
+  } else {
+    umap_coord <- umap(counts_TMM_ALL, n_neighbors=2)
+  }
 } else {
   umap_coord <- umap(counts_TMM_ALL)
 }
-umap_coord <-as.data.frame(umap_coord$layout) %>% setNames(c("UMAP1", "UMAP2"))
+umap_coord <- as.data.frame(umap_coord$layout) %>% 
+              setNames(c("UMAP1", "UMAP2"))
 
 outfile <- paste0(contrasts, "-", peakcaller, "_DiffBindQC_UMAP.csv")
 write.csv(umap_coord, outfile, row.names = F)
diff --git a/bin/prep_diffbindQC.py b/bin/prep_diffbindQC.py
index 59c19d3..b01b7a5 100755
--- a/bin/prep_diffbindQC.py
+++ b/bin/prep_diffbindQC.py
@@ -2,50 +2,61 @@
 
 import json
 import argparse
-
-parser = argparse.ArgumentParser(description='Script to prepare the DiffBind input csv')
-parser.add_argument('--wp',dest='workpath',required=True,help='Full path of the working directory')
-parser.add_argument('--pt',dest='peaktool',required=True,help='Name of the the peak calling tool, also the directory where the peak file will be located')
-parser.add_argument('--pe',dest='peakextension',required=True,help='The file extension of the peakcall output')
-parser.add_argument('--pc',dest='peakcaller',required=True,help='Value for the PeakCaller column of the DiffBind csv')
-parser.add_argument('--bd',dest='bamdir',required=True,help='Name of the directory where the bam files are located')
-parser.add_argument('--csv',dest='csvfile',required=True,help='Name of the output csv file')
-
-args = parser.parse_args()
-
-with open("config.json","r") as read_file:
-   config=json.load(read_file)
-   
-chip2input = config['project']['peaks']['inputs']
-groupdata = config['project']['groups']
-
-tmpIDs = [x for xs in groupdata.values() for x in xs]
-Ncounts = [tmpIDs.count(tmp) for tmp in set(tmpIDs)]
-
-samplesheet = [",".join(["SampleID","Condition", "Replicate", "bamReads", 
-         "ControlID", "bamControl", "Peaks", "PeakCaller"])]
-
-count = 1
-for chip in chip2input.keys():
-   if set(Ncounts) == {1}: # if all samples only in one group
-      for key in groupdata.keys():
-          if chip in groupdata[key]:
-             condition = key
-      replicate = str([ i + 1 for i in range(len(groupdata[condition])) if groupdata[condition][i]== chip ][0])
-   else:
-      condition = ""
-      replicate = str(count)
-      count = count +1
-   bamReads = args.bamdir + "/" + chip + ".Q5DD.bam"
-   controlID = chip2input[chip]
-   if controlID != "":
-      bamControl = args.bamdir + "/" +  controlID + ".Q5DD.bam"
-   else:
-      bamControl = ""
-   peaks = args.workpath + "/" + args.peaktool + "/" + chip + "/" + chip + args.peakextension
-   samplesheet.append(",".join([chip, condition, replicate, bamReads, 
-                   controlID, bamControl, peaks, args.peakcaller]))            
-
-f = open(args.csvfile, 'w')
-f.write ("\n".join(samplesheet))
-f.close()
+import csv
+from os.path import join
+
+
+def main(args):
+   with open(join(args.workpath, "config.json"), "r") as read_file:
+      config=json.load(read_file)
+      
+   chip2input = config['project']['peaks']['inputs']
+   groupdata = config['project']['groups']
+
+   tmpIDs = [x for xs in groupdata.values() for x in xs]
+   Ncounts = [tmpIDs.count(tmp) for tmp in set(tmpIDs)]
+
+   with open(args.csvfile, 'w') as csvfile:
+      columns = ["SampleID","Condition", "Replicate", "bamReads", 
+               "ControlID", "bamControl", "Peaks", "PeakCaller"]
+      writer = csv.DictWriter(csvfile, fieldnames=columns)
+      writer.writeheader()
+
+      count = 1
+      for chip in chip2input.keys():
+         if set(Ncounts) == {1}: # if all samples only in one group
+            for key in groupdata.keys():
+               if chip in groupdata[key]:
+                  condition = key
+            replicate = str([ i + 1 for i in range(len(groupdata[condition])) if groupdata[condition][i]== chip ][0])
+         else:
+            condition = ""
+            replicate = str(count)
+            count = count +1
+         bamReads = args.bamdir + "/" + chip + ".Q5DD.bam"
+         controlID = chip2input[chip]
+         if controlID != "":
+            bamControl = args.bamdir + "/" +  controlID + ".Q5DD.bam"
+         else:
+            bamControl = ""
+         peaks = args.workpath + "/" + args.peaktool + "/" + chip + "/" + chip + args.peakextension
+         row_values = [chip, condition, replicate, bamReads, controlID, bamControl, peaks, args.peakcaller]
+         writer.writerow(dict(zip(columns, row_values)))
+
+
+if __name__ == "__main__":
+   parser = argparse.ArgumentParser(description='Script to prepare the DiffBind input csv')
+   parser.add_argument('--wp', dest='workpath', required=True, 
+                       help='Full path of the working directory')
+   parser.add_argument('--pt', dest='peaktool', required=True, 
+                       help='Name of the the peak calling tool, also the directory where the peak file will be located')
+   parser.add_argument('--pe', dest='peakextension', required=True, 
+                       help='The file extension of the peakcall output')
+   parser.add_argument('--pc', dest='peakcaller', required=True, 
+                       help='Value for the PeakCaller column of the DiffBind csv')
+   parser.add_argument('--bd', dest='bamdir', required=True, 
+                       help='Name of the directory where the bam files are located')
+   parser.add_argument('--csv', dest='csvfile', required=True, 
+                       help='Name of the output csv file')
+
+   main(parser.parse_args())

From 73076f82db10a2a131ded571a6ac3e200945cfd0 Mon Sep 17 00:00:00 2001
From: Tovah Markowitz <markowitzte@nih.gov>
Date: Wed, 7 Aug 2024 10:24:53 -0400
Subject: [PATCH 27/28] Update prep_diffbind.py

Adding a trailing newline to output files
---
 bin/prep_diffbind.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bin/prep_diffbind.py b/bin/prep_diffbind.py
index fc96cd7..3711c68 100755
--- a/bin/prep_diffbind.py
+++ b/bin/prep_diffbind.py
@@ -51,4 +51,5 @@
 
 f = open(args.csvfile, 'w')
 f.write ("\n".join(samplesheet))
+f.write ("\n")
 f.close()

From 3aef261863ff90d4635fce6d265270314dce0a6f Mon Sep 17 00:00:00 2001
From: Ryan Routsong <routsongrm@nih.gov>
Date: Wed, 7 Aug 2024 12:00:31 -0400
Subject: [PATCH 28/28] fix: add blocking/control functions to grouping header

---
 workflow/rules/dba.smk              |  2 +-
 workflow/rules/trim_align_dedup.smk |  4 ++--
 workflow/scripts/blocking.py        | 34 -----------------------------
 workflow/scripts/grouping.py        | 26 ++++++++++++++++++++++
 4 files changed, 29 insertions(+), 37 deletions(-)
 delete mode 100644 workflow/scripts/blocking.py

diff --git a/workflow/rules/dba.smk b/workflow/rules/dba.smk
index b9aa984..c6f0b5e 100644
--- a/workflow/rules/dba.smk
+++ b/workflow/rules/dba.smk
@@ -5,7 +5,7 @@ import json
 from os.path import join
 from scripts.common import allocated, mk_dir_if_not_exist
 from scripts.peakcall import outputIDR, zip_peak_files, calc_effective_genome_fraction, get_manorm_sizes
-from scripts.blocking import test_for_block
+from scripts.grouping import test_for_block
 
 
 # ~~ workflow configuration
diff --git a/workflow/rules/trim_align_dedup.smk b/workflow/rules/trim_align_dedup.smk
index 52ab127..0642ce0 100644
--- a/workflow/rules/trim_align_dedup.smk
+++ b/workflow/rules/trim_align_dedup.smk
@@ -4,8 +4,8 @@
 import snakemake
 from os.path import join
 from scripts.common import allocated, get_bam_ext
-from scripts.grouping import dedup_out7, get_bam_input, get_ppqt_input
-from scripts.blocking import ctrl_test
+from scripts.grouping import dedup_out7, get_bam_input, get_ppqt_input, \
+                             ctrl_test
 
 # ~~ workflow configuration
 workpath                        = config['project']['workpath']
diff --git a/workflow/scripts/blocking.py b/workflow/scripts/blocking.py
deleted file mode 100644
index 270376b..0000000
--- a/workflow/scripts/blocking.py
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/usr/bin/env python3
-import os
-from os.path import join
-from collections import defaultdict
-
-
-# ~~~ Common helper functions for blocking or controls
-def test_for_block(groupdata, contrast, blocks):
-    """ only want to run blocking on contrasts where all
-    individuals are on both sides of the contrast """
-    contrastBlock = [ ]
-    for con in contrast:
-        group1 = con[0]
-        group2 = con[1]
-        block1 = [ blocks[sample] for sample in groupdata[group1] ]
-        block2 = [ blocks[sample] for sample in groupdata[group2] ]
-        if len(block1) == len(block2):
-            if len(set(block1).intersection(block2)) == len(block1):
-                contrastBlock.append(con)
-    return contrastBlock
-
-
-def ctrl_test(ctrl_dict, input_name, in_dir, mode=None):
-    sample = join(in_dir, f"{input_name}.Q5DD.RPGC.bw")
-    assert mode in ('chip', 'ctrl'), 'Unrecognized input file mode.'
-    # assert os.path.exists(sample), f'{sample} sample does not exist!'
-    
-    if input_name in ctrl_dict:
-        norm = join(in_dir, ctrl_dict[input_name] + ".Q5DD.RPGC.bw")
-        # assert os.path.exists(norm), f'{norm} control does not exist!'
-    else:
-        raise ValueError(f'ChIP sample {input_name} missing from input lookup: \n{str(ctrl_dict)}')
-    outs = {'chip': sample, 'ctrl': norm}
-    return outs[mode]
\ No newline at end of file
diff --git a/workflow/scripts/grouping.py b/workflow/scripts/grouping.py
index 8d78899..ee02b3e 100644
--- a/workflow/scripts/grouping.py
+++ b/workflow/scripts/grouping.py
@@ -110,3 +110,29 @@ def get_bam_input(bam_dir, wildcards, paired_end):
             bams.append(join(bam_dir, "{0}.sorted.bam".format(wildcards.name)))
     return bams
 
+
+def test_for_block(groupdata, contrast, blocks):
+    """Return the contrasts that are eligible for blocking.
+
+    A contrast (group1, group2) qualifies when both groups have the same
+    number of samples and share that many distinct block labels.
+    """
+    blocked = []
+    for pair in contrast:
+        left = [blocks[s] for s in groupdata[pair[0]]]
+        right = [blocks[s] for s in groupdata[pair[1]]]
+        if len(left) == len(right) and len(set(left).intersection(right)) == len(left):
+            blocked.append(pair)
+    return blocked
+
+
+def ctrl_test(ctrl_dict, input_name, in_dir, mode=None):
+    """Return the .Q5DD.RPGC.bw path of `input_name` (mode='chip') or of
+    the control that `ctrl_dict` maps it to (mode='ctrl')."""
+    chip_bw = join(in_dir, f"{input_name}.Q5DD.RPGC.bw")
+    assert mode in ('chip', 'ctrl'), 'Unrecognized input file mode.'
+    # every sample needs a control entry, even when mode == 'chip'
+    if input_name not in ctrl_dict:
+        raise ValueError(f'ChIP sample {input_name} missing from input lookup: \n{str(ctrl_dict)}')
+    ctrl_bw = join(in_dir, ctrl_dict[input_name] + ".Q5DD.RPGC.bw")
+    return {'chip': chip_bw, 'ctrl': ctrl_bw}[mode]
\ No newline at end of file