Refactor codebase #41

Merged: 30 commits, Aug 7, 2024
Changes shown below are from 27 of the 30 commits.

Commits
- a6cfe91 chore: refactor rule all, move some files to more appropriate locations (rroutsong, Jun 27, 2024)
- e89e0bd chore: refactor script locations, workflow rules, remove uropa rules … (rroutsong, Jul 5, 2024)
- 55ca719 fix: correct diffbindedger output paths (rroutsong, Jul 8, 2024)
- 21c4aea chore: add back bin scripts, correct pathing for bin scripts, formatting (rroutsong, Jul 8, 2024)
- 07eb85f fix: remove old imports, duplicate parameters (rroutsong, Jul 10, 2024)
- 83b0e5c fix: correct typo type to _type (rroutsong, Jul 10, 2024)
- 543121b fix: correct pathing on diffbind outputs (rroutsong, Jul 10, 2024)
- e5027d8 fix: missing imports in peakcall rules (rroutsong, Jul 11, 2024)
- 0e7ba2e fix: indent (rroutsong, Jul 11, 2024)
- 8f6cedd fix: fix single end functionality in bwa rule (rroutsong, Jul 11, 2024)
- c4e32cd chore: spacing, refactor manorm rule (rroutsong, Jul 11, 2024)
- 65e3e3c fix: reference global tmpdir and paired_end flag (rroutsong, Jul 11, 2024)
- f946886 fix: fix rep switch (rroutsong, Jul 11, 2024)
- af5e791 chore: more spacing, fix reps flag reversal (rroutsong, Jul 11, 2024)
- 52cd415 chore: make bin files executable, fix execution issues from test data… (rroutsong, Jul 15, 2024)
- b54ac88 fix: working out bugs discovered on AV (rroutsong, Jul 22, 2024)
- 2f6aff8 fix: testing corrections (rroutsong, Jul 29, 2024)
- 710cae0 fix: comment out manorm rules for now (rroutsong, Jul 31, 2024)
- 53a7949 fix: turn off sicer involved inputs in cfchip pipeline (rroutsong, Jul 31, 2024)
- 1779a91 fix: realign uropa, promotertable2, diffbind outputs inputs (rroutsong, Jul 31, 2024)
- 06fcb3c fix: realign promotertable cfchip inputs (rroutsong, Jul 31, 2024)
- 5b9ff2c Delete workflow/chrom-seek.code-workspace (skchronicles, Aug 2, 2024)
- 991c620 fix: minor review fixes, reverting some dev settings (rroutsong, Aug 2, 2024)
- 62aa68d Merge branch 'refactor_codebase' of http://github.com/OpenOmics/chrom… (rroutsong, Aug 2, 2024)
- a453a6b fix: comment out debug-dag from dryrun snakemake execution (rroutsong, Aug 2, 2024)
- 7cae04e fix: reconfigure a few outputs based on test runs (rroutsong, Aug 6, 2024)
- fd490e9 fix: refactor diffbind prep script, add error case exception for umap… (rroutsong, Aug 6, 2024)
- 73076f8 Update prep_diffbind.py (tovahmarkowitz, Aug 7, 2024)
- 3aef261 fix: add blocking/control functions to grouping header (rroutsong, Aug 7, 2024)
- e405e1d Merge branch 'refactor_codebase' of http://github.com/OpenOmics/chrom… (rroutsong, Aug 7, 2024)
2 files renamed without changes.
16 changes: 13 additions & 3 deletions, workflow/scripts/DiffBind_v2_cfChIP_QC.Rmd → bin/DiffBind_v2_cfChIP_QC.Rmd (mode 100644 → 100755)

````diff
@@ -146,7 +146,10 @@ try(dba.plotPCA(DBdataCounts),silent=TRUE)
 ```{r TMM}
 vec <- c("seqnames", "start", "end", "width", "strand", samples$samples$SampleID)
 consensus2 <- dba.peakset(DBdataCounts, bRetrieve=TRUE) %>% ##extracts TMM-normalized counts
-  as.data.frame() %>% setNames(vec) %>% arrange(start, end) %>% mutate(Peaks = paste0("Peak",1:nrow(.))) %>%
+  as.data.frame() %>%
+  setNames(vec) %>%
+  arrange(start, end) %>%
+  mutate(Peaks = paste0("Peak",1:nrow(.))) %>%
   dplyr::select(1:4, Peaks, samples$samples$SampleID)
 
 outfile1 <- paste0(contrasts, "-", peakcaller, "_DiffBindQC_TMMcounts.csv")
@@ -164,12 +167,19 @@ counts_TMM_ALL <- counts_TMM_ALL %>% dplyr::select(5:ncol(.)) %>%
   t() %>% log10() %>% as.data.frame(.)
 ##UMAP coordinates
 set.seed(123)
+
 if (nrow(samples$samples) < 16) {
-  umap_coord <- umap(counts_TMM_ALL, n_neighbors= nrow(samples$samples)-1)
+  neighbors=nrow(samples$samples)-1
+  if (neighbors > 1) {
+    umap_coord <- umap(counts_TMM_ALL, n_neighbors=neighbors)
+  } else {
+    umap_coord <- umap(counts_TMM_ALL, n_neighbors=2)
+  }
 } else {
   umap_coord <- umap(counts_TMM_ALL)
 }
-umap_coord <-as.data.frame(umap_coord$layout) %>% setNames(c("UMAP1", "UMAP2"))
+umap_coord <- as.data.frame(umap_coord$layout) %>%
+  setNames(c("UMAP1", "UMAP2"))
 
 outfile <- paste0(contrasts, "-", peakcaller, "_DiffBindQC_UMAP.csv")
 write.csv(umap_coord, outfile, row.names = F)
````
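The new branch above clamps UMAP's `n_neighbors` for small cohorts: with fewer than 16 samples it uses `n - 1`, and never less than 2. A minimal Python sketch of the same clamping rule (the helper name `choose_n_neighbors` and the default of 15 neighbors are illustrative assumptions, not part of the PR):

```python
def choose_n_neighbors(n_samples, default=15):
    """Clamp UMAP's n_neighbors for small cohorts.

    Mirrors the guard added to DiffBind_v2_cfChIP_QC.Rmd: for fewer than
    16 samples use n_samples - 1, but never drop below 2; otherwise keep
    the (assumed) library default.
    """
    if n_samples < 16:
        return max(n_samples - 1, 2)
    return default

# For example, a 3-sample cohort gets n_neighbors = 2.
```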
5 files renamed without changes.
8 changes: 4 additions & 4 deletions, workflow/scripts/filterMetrics → bin/filterMetrics.py

The changed lines are identical in content, so these are whitespace-only fixes; indentation below is reconstructed:

```diff
@@ -60,9 +60,9 @@ def getmetadata(type):
     elif type == 'tnreads':
         metadata = 'NReads'
     elif type == 'mnreads':
-       metadata = 'NMappedReads'
+        metadata = 'NMappedReads'
     elif type == 'unreads':
-       metadata = 'NUniqMappedReads'
+        metadata = 'NUniqMappedReads'
     elif type == 'fragLen':
         metadata = 'FragmentLength'
     return metadata
@@ -88,11 +88,11 @@ def filteredData(sample, ftype):
             extenders = []
             for ppqt_value in linelist:
                 if int(ppqt_value) > 150:
-                   extenders.append(ppqt_value)
+                    extenders.append(ppqt_value)
             if len(extenders) > 0:
                 print("{}\t{}\t{}".format(sample, mtypes, extenders[0]))
             else:
-               print("{}\t{}\t{}".format(sample, mtypes, linelist[0]))
+                print("{}\t{}\t{}".format(sample, mtypes, linelist[0]))
         elif ftype == 'ppqt' or ftype == 'ngsqc' or ftype == 'nrf':
             mtypes = getmetadata(ftype)
             for i in range(len(linelist)):
```
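The `fragLen` branch in `filterMetrics.py` keeps the first ppqt fragment-length estimate above 150 bp and falls back to the first estimate when none qualifies. A standalone sketch of that selection rule (the helper name is illustrative):

```python
def pick_fragment_length(estimates, min_len=150):
    """Pick a fragment length from a ppqt estimate list.

    Prefer the first estimate strictly greater than min_len; otherwise
    fall back to the first estimate, matching the filterMetrics logic.
    """
    extenders = [e for e in estimates if int(e) > min_len]
    return extenders[0] if extenders else estimates[0]
```

Estimates may arrive as strings from the ppqt output, which is why the comparison coerces with `int()` while returning the original value unchanged.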
9 files renamed without changes.
8 changes: 4 additions & 4 deletions, workflow/scripts/prep_diffbind.py → bin/prep_diffbind.py (mode 100644 → 100755)

```diff
@@ -23,20 +23,20 @@
 blocks = config['project']['blocks']
 
 if None in list(blocks.values()):
-    samplesheet = [",".join(["SampleID","Condition", "Replicate", "bamReads",
+    samplesheet = [",".join(["SampleID", "Condition", "Replicate", "bamReads",
                              "ControlID", "bamControl", "Peaks", "PeakCaller"])]
 else:
-    samplesheet = [",".join(["SampleID","Condition","Treatment","Replicate", "bamReads",
+    samplesheet = [",".join(["SampleID", "Condition", "Treatment", "Replicate", "bamReads",
                              "ControlID", "bamControl", "Peaks", "PeakCaller"])]
 
 
 for condition in args.group1, args.group2:
     for chip in groupdata[condition]:
         replicate = str([ i + 1 for i in range(len(groupdata[condition])) if groupdata[condition][i]== chip ][0])
-        bamReads = args.workpath + "/" + args.bamdir + "/" + chip + ".Q5DD.bam"
+        bamReads = args.bamdir + "/" + chip + ".Q5DD.bam"
         controlID = chip2input[chip]
         if controlID != "":
-            bamControl = args.workpath + "/" + args.bamdir + "/" + controlID + ".Q5DD.bam"
+            bamControl = args.bamdir + "/" + controlID + ".Q5DD.bam"
         else:
             bamControl = ""
         peaks = args.workpath + "/" + args.peaktool + "/" + chip + "/" + chip + args.peakextension
```
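The `replicate` expression in `prep_diffbind.py` is just the 1-based position of a sample within its group, returned as a string for the CSV. Pulled out as a sketch (the helper name is illustrative):

```python
def replicate_number(group_members, sample):
    """Return the 1-based position of sample within its group, as a string.

    Equivalent to the inline list comprehension in prep_diffbind.py.
    """
    return str([i + 1 for i, s in enumerate(group_members) if s == sample][0])
```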
62 changes: 62 additions & 0 deletions, bin/prep_diffbindQC.py (new file)

All lines are additions; indentation is reconstructed from the flattened diff:

```python
#!/usr/bin/env python3

import json
import argparse
import csv
from os.path import join


def main(args):
    with open(join(args.workpath, "config.json"), "r") as read_file:
        config = json.load(read_file)

    chip2input = config['project']['peaks']['inputs']
    groupdata = config['project']['groups']

    tmpIDs = [x for xs in groupdata.values() for x in xs]
    Ncounts = [tmpIDs.count(tmp) for tmp in set(tmpIDs)]

    with open(args.csvfile, 'w') as csvfile:
        columns = ["SampleID", "Condition", "Replicate", "bamReads",
                   "ControlID", "bamControl", "Peaks", "PeakCaller"]
        writer = csv.DictWriter(csvfile, fieldnames=columns)
        writer.writeheader()

        count = 1
        for chip in chip2input.keys():
            if set(Ncounts) == {1}:  # if all samples only in one group
                for key in groupdata.keys():
                    if chip in groupdata[key]:
                        condition = key
                        replicate = str([i + 1 for i in range(len(groupdata[condition])) if groupdata[condition][i] == chip][0])
            else:
                condition = ""
                replicate = str(count)
                count = count + 1
            bamReads = args.bamdir + "/" + chip + ".Q5DD.bam"
            controlID = chip2input[chip]
            if controlID != "":
                bamControl = args.bamdir + "/" + controlID + ".Q5DD.bam"
            else:
                bamControl = ""
            peaks = args.workpath + "/" + args.peaktool + "/" + chip + "/" + chip + args.peakextension
            row_values = [chip, condition, replicate, bamReads, controlID, bamControl, peaks, args.peakcaller]
            writer.writerow(dict(zip(columns, row_values)))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Script to prepare the DiffBind input csv')
    parser.add_argument('--wp', dest='workpath', required=True,
                        help='Full path of the working directory')
    parser.add_argument('--pt', dest='peaktool', required=True,
                        help='Name of the peak calling tool, also the directory where the peak file will be located')
    parser.add_argument('--pe', dest='peakextension', required=True,
                        help='The file extension of the peakcall output')
    parser.add_argument('--pc', dest='peakcaller', required=True,
                        help='Value for the PeakCaller column of the DiffBind csv')
    parser.add_argument('--bd', dest='bamdir', required=True,
                        help='Name of the directory where the bam files are located')
    parser.add_argument('--csv', dest='csvfile', required=True,
                        help='Name of the output csv file')

    main(parser.parse_args())
```
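The `set(Ncounts) == {1}` check above asks whether every sample ID occurs in exactly one group; only in that case does the script assign a real Condition and group-based Replicate, otherwise it falls back to a running counter. The same test as a standalone sketch (the helper name is illustrative):

```python
def samples_in_single_group(groupdata):
    """True if every sample ID occurs in exactly one group.

    groupdata maps group name -> list of sample IDs, as in config.json.
    Flatten all IDs, then require each unique ID to appear exactly once.
    """
    ids = [x for xs in groupdata.values() for x in xs]
    return all(ids.count(x) == 1 for x in set(ids))
```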
File renamed without changes.
4 changes: 2 additions & 2 deletions, config/containers.json

```diff
@@ -1,7 +1,7 @@
 {
     "images": {
-        "cfchip": "docker://skchronicles/cfchip_toolkit_v0.5.0",
-        "python": "docker://asyakhleborodova/chrom_seek_python_v0.1.0",
+        "cfchip": "docker://skchronicles/cfchip_toolkit:v0.5.0",
+        "python": "docker://asyakhleborodova/chrom_seek_python:v0.1.0",
         "ppqt": "docker://asyakhleborodova/ppqt:v0.2.0"
     }
 }
```
8 changes: 6 additions & 2 deletions, src/run.py

```diff
@@ -19,7 +19,7 @@
 from . import version as __version__
 
 
-def init(repo_path, output_path, links=[], required=['workflow', 'resources', 'config']):
+def init(repo_path, output_path, links=[], required=['workflow', 'bin', 'resources', 'config']):
     """Initialize the output directory. If user provides a output
     directory path that already exists on the filesystem as a file
     (small chance of happening but possible), a OSError is raised. If the
@@ -207,6 +207,7 @@ def setup(sub_args, ifiles, repo_path, output_path):
     # Add other runtime info for debugging
     config['project']['version'] = __version__
     config['project']['workpath'] = os.path.abspath(sub_args.output)
+    config['project']['binpath'] = os.path.abspath(os.path.join(config['project']['workpath'], 'bin'))
     git_hash = git_commit_hash(repo_path)
     config['project']['git_commit_hash'] = git_hash # Add latest git commit hash
     config['project']['pipeline_path'] = repo_path # Add path to installation
@@ -221,7 +222,8 @@ def setup(sub_args, ifiles, repo_path, output_path):
         v = str(v)
         config['options'][opt] = v
 
-
+    # initiate a few workflow vars
+    config['options']['peak_type_base'] = ["protTSS"]
     return config
 
 
@@ -608,6 +610,8 @@ def dryrun(outdir, config='config.json', snakefile=os.path.join('workflow', 'Sna
     dryrun_output = subprocess.check_output([
         'snakemake', '-npr',
         '-s', str(snakefile),
+        '--verbose',
+        # '--debug-dag',
         '--use-singularity',
         '--rerun-incomplete',
         '--cores', str(256),
```
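The `setup()` change records an absolute `binpath` alongside `workpath` in the run config, so rules can reference the relocated `bin/` scripts. A sketch of that bookkeeping step (the helper name `record_binpath` is illustrative; the config keys mirror the diff):

```python
import os


def record_binpath(config, output_dir):
    """Store absolute workpath and its bin/ subdirectory in the run config.

    Mirrors the two config['project'] assignments added in src/run.py.
    """
    config.setdefault('project', {})
    workpath = os.path.abspath(output_dir)
    config['project']['workpath'] = workpath
    config['project']['binpath'] = os.path.abspath(os.path.join(workpath, 'bin'))
    return config
```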
1 change: 0 additions & 1 deletion, src/run.sh

```diff
@@ -209,7 +209,6 @@ function submit(){
     if [[ ${6#\'} != /lscratch* ]]; then
         CLUSTER_OPTS="sbatch --cpus-per-task {cluster.threads} -p {cluster.partition} -t {cluster.time} --mem {cluster.mem} --job-name={params.rname} -e $SLURM_DIR/slurm-%j_{params.rname}.out -o $SLURM_DIR/slurm-%j_{params.rname}.out {cluster.ntasks} {cluster.ntasks_per_core} {cluster.exclusive}"
     fi
-    # Create sbacth script to build index
     cat << EOF > kickoff.sh
 #!/usr/bin/env bash
 #SBATCH --cpus-per-task=16
```