diff --git a/build/scripts-3.10/4CPre.py b/build/scripts-3.10/4CPre.py new file mode 100755 index 00000000..426eca46 --- /dev/null +++ b/build/scripts-3.10/4CPre.py @@ -0,0 +1,417 @@ +#!/home/caoy7/anaconda3/envs/astroBoy/bin/python +#--coding:utf-8-- +""" +4CPre.py +Pre-processing code for 4C-seq data, implemented with cLoops2, from fastq to fragments and viewpoint bedgraph files. +2022-03-11: finished and well tested. +""" + +__author__ = "CAO Yaqiang" +__email__ = "caoyaqiang0410@gmail.com" + +#systematic library +import os +import time +import gzip +import random +import argparse +import subprocess +from glob import glob +from datetime import datetime +from argparse import RawTextHelpFormatter + +#3rd library +import HTSeq +import numpy as np +import pandas as pd +from joblib import Parallel, delayed +from Bio.Seq import Seq +from Bio.SeqIO.QualityIO import FastqGeneralIterator + +#cLoops2 +from cLoops2.utils import getLogger, callSys, isTool + +#global settings +#logger +date = time.strftime(' %Y-%m-%d', time.localtime(time.time())) +logger = getLogger(fn=os.getcwd() + "/" + date.strip() + "_" + + os.path.basename(__file__) + ".log") + + +def help(): + """ + Create the command line interface for the script. + """ + description = """ + Preprocess the raw reads of FASTQ files of 4C-seq to reference + geneome with bowtie2 and obtain the unqiue PETs with quality control + results. + + Example: + 4CPre.py -fq test -o test -ref ../bowtie2/hg38 -p 5 -mapq 10 + """ + parser = argparse.ArgumentParser(description=description, + formatter_class=RawTextHelpFormatter) + parser.add_argument("-fq", + dest="fq", + required=True, + type=str, + help="The raw .fastq.gz files.") + parser.add_argument( + "-o", + dest="output", + required=False, + type=str, + default="4C", + help= + "Output directory, default is 4C, if directory not exists, create one." + ) + parser.add_argument( + "-ref", + dest="ref", + required=True, + type=str, + help= + "Bowtie2 reference index prefix, such as ./ref/hg38, generated from\n"\ + "bowtie2-build hg38.fa hg38." + ) + parser.add_argument( + "-bait", + dest="bait", + required=True, + type=str, + help= + "Bait sequence designed for the view point." + ) + parser.add_argument( + "-ligationSite", + dest="ligationSite", + required=True, + type=str, + help= + "Ligation site for bait and target sequence. For example, if MboI used, set as GATC." + ) + parser.add_argument( + "-genomeFrag", + dest="genomeFrag", + required=True, + type=str, + help= + "Genome fragment in bed format. Output of digest_genome.py from HiC-Pro." + ) + + parser.add_argument( + "-p", + dest="cpu", + required=False, + type=int, + default=5, + help="How many cpus used by Bowtie2, default is 5." + ) + parser.add_argument( + "-mapq", + dest="mapq", + required=False, + default=10, + type=int, + help="MAPQ cutoffs for filtering mapped reads, default is 10." + ) + parser.add_argument( + "-cis", + dest="cis", + required=False, + default=False, + action="store_true", + help= + "Whether to only keep intra-chromosomal reads with the bait. The\n"\ + "default is to keep all. " + ) + parser.add_argument( + "-log", + dest="log", + required=False, + default=False, + action="store_true", + help= + "Whether to log2 transform the bedGraph signal. Set this to do log2." + ) + op = parser.parse_args() + return op + + +def getBaitPos(bait, ref): + """ + Get the genomic position for the bait sequence. 
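+    The bait sequence is written to a temporary FASTA file and aligned with
+    bowtie2 in --end-to-end mode; the first SAM record is parsed to return
+    (chrom, pos, strand), with SAM flag 16 taken as the reverse strand.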
+ """ + n = str(random.random())+".fa" + with open(n,"w") as fo: + fo.write(">bait\n"+bait+"\n") + doBowtie = "bowtie2 --quiet --no-head --no-sq -f --end-to-end -x {ref} -U {fa}".format(ref=ref, fa=n) + status, output = subprocess.getstatusoutput(doBowtie) + os.system("rm %s"%n) + output = output.split("\n")[0].split("\t") + if output[1] == "16": + strand = "-" + else: + strand = "+" + chrom = output[2] + pos = output[3] + return chrom, pos, strand + + +def match(sa, sb, miss=2): + s = 0 + for i in range(len(sa)): + if sa[i] != sb[i]: + s += 1 + if s > miss: + return False + else: + return True + + +def parseSeq(fin, fo, bait, enz, miss=2, rlen=10): + tot = 0 + q = 0 + with gzip.open(fo, "wt") as fout: + with gzip.open(fin, "rt") as f: + for r in FastqGeneralIterator(f): + r = list(r) + tot += 1 + #if tot % 100000 == 0: + # print("%s reads processed for %s" % (tot, fo)) + s = r[1][:len(bait)] + m = match(bait, s, miss) + if m == False: + continue + flag = False + for i in range(len(bait), len(r[1])): + if r[1][i:i + len(enz)] == enz: + pos = i + len(enz) + 1 + if len(r[1]) - pos > rlen: + flag = True + break + if flag == False: + continue + r[1] = r[1][pos:] + r[2] = r[2][pos:] + q += 1 + fout.write("@%s\n%s\n+\n%s\n" % (r[0], r[1], r[2])) + return tot, q + + +def sam2bam(sam, bam): + """ + SAM to BAM file + """ + samview = "samtools view -S %s -b -o %s" % (sam, bam) + samsort = "samtools sort -@ 2 {bam} -T {pre} -o {bam}".format( + bam=bam, pre=bam.replace(".bam", "")) + samindex = "samtools index {bam} {bai}".format(bam=bam, + bai=bam.replace( + ".bam", ".bai")) + rmsam = "rm %s" % (sam) + cmds = [samview, samsort, samindex, rmsam] + callSys(cmds, logger) + + +def doMap(fq, ref,sam,bam, cpus=5): + #doBowtie = "bowtie2 --no-mixed --no-discordant -p {cpus} -q --local --very-sensitive -x {ref} {fq} -S {sam}".format( + doBowtie = "bowtie2 -p {cpus} -q --end-to-end --very-sensitive -x {ref} {fq} -S {sam}".format( + cpus=cpus, ref=ref, fq=fq, sam=sam) + logger.info(doBowtie) + status, output = subprocess.getstatusoutput(doBowtie) + #trim with "Warning" + output = output.split("\n") + output = [t for t in output if not t.startswith("Warning")] + mapRatio = float(output[-1].split("%")[0]) + sam2bam(sam, bam) + return mapRatio + + +def bam2Bed(bam, bed, mapq=10): + """ + Converting BAM file to BED file. + bam: bam file path + bed: bed file path + mapq: mapq cutoff to remove bad qulity reads. + """ + fd = os.path.splitext(bed)[0] + d = os.path.dirname(bed) + if not os.path.exists(d): + os.mkdir(d) + nb = bam.split("/")[-1] + tmpbam = fd + ".2.bam" + #important for paired end reads, do it all for all kinds of files. 
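+    #sort by read name, keep only mapped reads (-F 4) above the MAPQ cutoff,
+    #then convert the filtered BAM to BED with bamToBed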
+ samsort = "samtools sort -n -@ 2 {bam} -T {pre} -o {tmpbam}".format( + bam=bam, tmpbam=nb, pre=nb.replace(".bam", "")) + rmunmaped = "samtools view -b -q {} -F 4 {} >> {}".format(mapq, nb, tmpbam) + callSys([samsort, rmunmaped], logger) + bam2bed = "bamToBed -i {bam} > {bed}".format(bam=tmpbam, bed=bed) + logger.info(bam2bed) + status, output = subprocess.getstatusoutput(bam2bed) + rmbam = "rm {} {}".format(tmpbam, nb) + callSys([rmbam, "gzip %s" % bed], logger) + + +def getUniqueBed(f, fout, chrom, cis=False): + redus = set() + tot = 0 + c = 0 + with gzip.open(fout, "wt") as fo: + for line in gzip.open(f, "rt"): + tot += 1 + line = line.split("\n")[0].split("\t") + if cis and chrom is not None: + if line[0] != chrom: + continue + s = int(line[1]) + e = int(line[2]) + r = (line[0], s, e) + if r in redus: + continue + else: + if line[0] == chrom: + c += 1 + redus.add(r) + fo.write("\t".join(line) + "\n") + return tot, len(redus), c + + +def bed2hicFrag(bed, hicFrag, fo, chrom=None, cis=False): + frags = HTSeq.GenomicArrayOfSets("auto", stranded=False) + for line in open(hicFrag): + line = line.split("\n")[0].split("\t") + if cis and chrom is not None and line[0] != chrom: + continue + iv = HTSeq.GenomicInterval(line[0], int(line[1]), int(line[2])) + name = (line[0], line[1], line[2], line[3]) + frags[iv] += name + c = 0 + with gzip.open(fo, "wt") as fout: + for line in gzip.open(bed, "rt"): + line = line.split("\n")[0].split("\t") + if cis and chrom is not None and line[0] != chrom: + continue + strand = line[5] + if strand == "+": + p = HTSeq.GenomicPosition(line[0], int(line[1])) + else: + p = HTSeq.GenomicPosition(line[0], int(line[2])) + t = list(frags[p]) + if len(t) == 0: + continue + c += 1 + line = t[0] + fout.write("\t".join(line) + "\n") + return c + + +def bed2bdg(f, fout, log=False): + model = HTSeq.GenomicArray("auto", stranded=False) + t = 0 + for line in gzip.open(f, "rt"): + line = line.split("\n")[0].split("\t") + chrom = line[0] + s = int(line[1]) + e = int(line[2]) + iv = HTSeq.GenomicInterval(chrom, s, e) + model[iv] += 1 + t += 1 + with open(fout, "w") as fo: + for iv, value in model.steps(): + if value > 0: + value = value / 1.0 / t * 10**6 #RPM + if log: + value = np.log2(value + 1) + line = [iv.chrom, iv.start, iv.end, value] + line = list(map(str, line)) + fo.write("\t".join(line) + "\n") + + +def main(): + """ + Batch converting from bam to bedpe. + """ + #prepare everything + op = help() + for t in ["bowtie2", "samtools", "bamToBed"]: + if not isTool(t): + logger.error("%s not exits! Please install through conda." % t) + return + if not os.path.exists(op.fq): + logger.error("Input %s not exists! Return." % op.fq) + return + if len(glob(op.ref + "*.bt2")) == 0: + logger.error("Bowtie2 reference not exists for prefix of %s! Return." % + op.ref) + return + if not os.path.exists(op.output): + os.makedirs(op.output, exist_ok=True) + else: + fs = glob(os.path.join(op.output, "*")) + if len(fs) > 0: + logger.info( + "Target output directory %s is not empty, may over-write some files." 
+ % op.output) + return + + + logger.info("%s: Start the analysis of sample."%(op.output)) + bait = op.bait.upper() + enz = op.ligationSite.upper() + + #step 1, get the bait genomic coordinate + logger.info("%s_Step1: Get bait sequence genomic location."%op.output) + vpChrom, vpPos, vpStrand = getBaitPos(bait, op.ref) + + #step 2, pre-process fastq files to only keep the reads there are bait and remove the bait sequence + logger.info("%s_Step2: Trim bait sequence and only keep the target reads."%op.output) + fastq = op.output+"/"+op.output+".fastq.gz" + tot,hasBait= parseSeq(op.fq,fastq,bait, enz) + + #step 3, mapping the target reads to the genome + logger.info("%s_Step3: Map the target reads to the reference genome."%op.output) + sam = op.output+"/"+op.output+".sam" + bam = op.output+"/"+op.output+".bam" + mapRatio = doMap(fastq,op.ref,sam,bam,cpus=op.cpu) + + #step 4, get the high quality unqiue reads + logger.info("%s_Step4: Get the high quality unique reads."%op.output) + bed = op.output+"/"+op.output+".bed" + bam2Bed(bam,bed,mapq=op.mapq) + uniqueBed = op.output+"/"+op.output+"_unique.bed.gz" + totMapped, uniqueMapped, uniqueCis = getUniqueBed(bed + ".gz", uniqueBed,vpChrom,cis=op.cis) + + #step 5, map the reads to fragments + logger.info("%s_Step5: Map the reads to genomic fragments digested."%op.output) + frag = op.output+"/"+op.output+"_frag.bed.gz" + cFrags = bed2hicFrag(uniqueBed,op.genomeFrag,frag,chrom=vpChrom,cis=op.cis) + + #step 6, generate view point bedgraph + logger.info("%s_Step6: Generate visualization bedGraph file."%op.output) + bdg = op.output+"/"+op.output+"_frag.bdg" + bed2bdg(frag, bdg, log=op.log) + + #step 7, generate the qc report + rs = { + "0_totalRawReads":tot, + "1_rawReadsHasBait": hasBait, + "2_baitRatio": hasBait/tot, + "3_trimedReadsMappingRatio": mapRatio, + "4_highQualityMappedReads(MAPQ>=10)":totMapped, + "5_highQualityUniqueReads":uniqueMapped, + "6_redundancy": 1- uniqueMapped/totMapped, + "7_highQualityUniqueCisReads":uniqueCis, + "8_cisRatio": uniqueCis/uniqueMapped, + "9_validFragments": cFrags, + "10_validRatio": cFrags/uniqueMapped, + } + rs = pd.Series(rs) + rs.to_csv(op.output+"/"+op.output+"_report.txt",sep="\t",header=None) + + logger.info("%s:The analysis finished."%(op.output)) + + +if __name__ == '__main__': + main() diff --git a/build/scripts-3.10/anoPeaks.py b/build/scripts-3.10/anoPeaks.py new file mode 100755 index 00000000..e15881b3 --- /dev/null +++ b/build/scripts-3.10/anoPeaks.py @@ -0,0 +1,186 @@ +#!/home/caoy7/anaconda3/envs/astroBoy/bin/python +#--coding:utf-8 -- +""" +anoPeaks.py +cLoops2 anoPeaks.py annotate peaks genomic locations as promoters or enhancers. +""" + +__date__ = "2023-08-10" +__modified__ = "" +__email__ = "caoyaqiang0410@gmail.com" + +#general library +import os +import argparse +from argparse import RawTextHelpFormatter + +#3rd library +import numpy as np +import pandas as pd +from tqdm import tqdm +from scipy.spatial import KDTree + +#cLoops2 +from cLoops2.ano import readGenes, findOverlapOrNearest +from cLoops2.ds import Peak + + +def help(): + """ + Create the command line interface for the script. + """ + description = """ + Annotate peaks as promoters or enhancers according to gene annotations. + + Example: + anoPeaks.py -f H3K27ac_peaks.bed -gtf hg38.gtf -o H3K27ac_peaks + """ + parser = argparse.ArgumentParser(description=description, + formatter_class=RawTextHelpFormatter) + parser.add_argument( + "-f", + dest="peakf", + required=True, + type=str, + help= + "Input .bed file as peaks. 
Peak id will be renamed as chrom|start|end") + parser.add_argument("-gtf", + dest="gtf", + default="", + required=False, + type=str, + help="GTF file annotation for genes.") + parser.add_argument( + "-tid", + dest="tid", + default=False, + required=False, + action="store_true", + help= + "Whether to use transcript id instead of gene id for annotation. Default\n"\ + "is not." + ) + parser.add_argument( + "-pdis", + dest="pdis", + default=2000, + required=False, + type=int, + help= + "Distance limitation for anchor to nearest gene/transcript TSS to define\n"\ + "as promoter. Default is 2000 bp." + ) + parser.add_argument("-o", + dest="output", + required=True, + type=str, + help="Output prefix.") + op = parser.parse_args() + return op + + +def parseBed2Peaks(fbed): + """ + """ + peaks = {} + for line in open(fbed): + line = line.split("\n")[0].split("\t") + if len(line) < 3: + continue + chrom = line[0] + if chrom not in peaks: + peaks[chrom] = [] + peak = Peak() + peak.id = "|".join(line[:3]) + peak.chrom = chrom + peak.start = int(line[1]) + peak.end = int(line[2]) + peak.length = peak.end - peak.start + peaks[chrom].append(peak) + return peaks + + +def anoPeaks( + peakf, + fout, + gtf, + tid=False, + pdis=2000, +): + """ + Annotate peaks. + @param peakf: str, name of peaks file, .bed file + @param fout: str, output prefix + @param gtf: str, GTF file name + @param tid: bool, if set true, use transcript id for alternative TSS + @param pdis: <=distance nearest TSS to define as promoter + """ + if not os.path.isfile(peakf): + print("Input %s not exists, return." % peakf) + return + elif not os.path.isfile(gtf): + print("Input %s not exists, return." % gtf) + return + else: + #gene annotions, {chrom:{tss:g}}, tss is int + genes = readGenes(gtf, tid=tid) + peaks = parseBed2Peaks(peakf) + #store results + rs = {} + #find nearest TSS + print("annotating peaks" ) + for chrom in tqdm(peaks.keys()): + if chrom not in genes: + continue + gs = genes[chrom] + ts = np.array([[tss] for tss in gs.keys()]) + cov = {} + for tss, g in gs.items(): + cov[tss] = g + tree = KDTree(ts) + for peak in peaks[chrom]: + xgs, xds = findOverlapOrNearest(gs, ts, tree, peak.start, + peak.end) + if len(xgs) > 1: + xt = "Promoter" + xd = 0 + else: + xd = xds[0] + if abs(xd) <= pdis: + xt = "Promoter" + else: + xt = "Enhancer" + rs[peak.id] = { + "1_chrom": + peak.chrom, + "2_start": + peak.start, + "3_end": + peak.end, + "4_type": + xt, + "5_nearestDistanceToTSS": + xd, + "6_nearestTargetTSS": + ",".join([ + xg.chrom + ":" + str(xg.start) + "-" + str(xg.end) + + "|" + xg.strand + "|" + xg.name for xg in xgs + ]), + } + rs = pd.DataFrame(rs).T + rs.to_csv(fout + "_anoPeaks.txt", sep="\t", index_label="peakId") + + +def main(): + op = help() + anoPeaks( + op.peakf, + op.output, + op.gtf, + tid=op.tid, + pdis=op.pdis, + ) + + +if __name__ == "__main__": + main() diff --git a/build/scripts-3.10/compareComp.py b/build/scripts-3.10/compareComp.py new file mode 100755 index 00000000..51b9eecd --- /dev/null +++ b/build/scripts-3.10/compareComp.py @@ -0,0 +1,407 @@ +#!/home/caoy7/anaconda3/envs/astroBoy/bin/python +#--coding:utf-8 -- +""" +compareComp.py +cLoops2 compareComp.py compare compartment PC1 values based on Mahalanobis distance and annotate the changed bins. 
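+PC1 bins are stitched into compartments, each bin is tested with a two-passes
+Mahalanobis distance test, classified as A->B, B->A, A->A or B->B, and the
+significantly changed bins are annotated with overlapping genes from the GTF.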
+ +""" + +__date__ = "2023-03-09" +__modified__ = "" +__email__ = "caoyaqiang0410@gmail.com" + +#sys library +import argparse +from argparse import RawTextHelpFormatter + +#3rd library +import pylab +import HTSeq +import pandas as pd +import numpy as np + +#cLoops2 +from cLoops2.stat import twoPassesMDTest +from cLoops2.settings import * + + +def help(): + """ + Create the command line interface for the script. + """ + description = """ + Pair-wisely compare compartments PC1 based on two-passes Mahalanobis distance. + + Example: + compareComp.py -a young_pc1.bdg -b old_pc1.bdg -o youngVsOld -na Young -b Old -pcut 0.01 -gtf mm10.gtf + """ + parser = argparse.ArgumentParser(description=description, + formatter_class=RawTextHelpFormatter) + parser.add_argument( + "-a", + dest="fa", + required=True, + type=str, + help= + "Input bedGraph file for the first sample containing the PC1 scores from comparment analysis. Should be aligned for A/B considering gene density or CpG islands contents. Better to be a control such as young or wild-type sample." + ) + parser.add_argument( + "-b", + dest="fb", + required=True, + type=str, + help= + "Input bedGraph file for the second sample containing the PC1 scores from comparment analysis." + ) + parser.add_argument( + "-na", + dest="na", + type=str, + required=True, + help= + "Name of first sample, will be shown in the output figure. Only use alphabet and numbers." + ) + parser.add_argument( + "-nb", + dest="nb", + type=str, + required=True, + help="Name of second sample, will be shown in the output figure.") + parser.add_argument( + "-gtf", + dest="gtf", + default="", + required=True, + type=str, + help= + "GTF file annotation for genes. Significant flip/switch overlapped genes will be reported based on the gene annotation file." + ) + parser.add_argument( + "-pcut", + dest="pcut", + type=float, + default=0.01, + help= + "Chi-Square p-value cutoff for detecting siginficant different compartment, default is 0.01." + ) + + parser.add_argument("-o", + dest="output", + required=True, + type=str, + help="Output prefix.") + op = parser.parse_args() + return op + + +def stichBins2Comp(rs): + """ + Stich bins to compartment + """ + comps = [] + i = 0 + while i < len(rs): + j = i + 1 + p = i + while j < len(rs): + if rs[j][-1] * rs[i][-1] > 0: + p = j + j += 1 + else: + break + if rs[i][-1] > 0: + flag = "compartmentA" + else: + flag = "compartmentB" + s = rs[i][1] + e = rs[p][2] + vs = [rs[t][-1] for t in range(i, j)] + v = np.mean(vs) + comps.append([ + rs[i][0], s, e, + rs[i][0] + ":" + str(s) + "-" + str(e) + "|" + flag, + str(v) + ]) + i = j + return comps + + +def getCompartment(bdgf, fout): + """ + Get compartment according to PC1 values. + """ + ds = {} + for line in open(bdgf): + line = line.split("\n")[0].split("\t") + if line[0] not in ds: + ds[line[0]] = [] + line[-1] = float(line[-1]) + ds[line[0]].append(line) + #stich domains + cs = list(ds.keys()) + cs.sort() + with open(fout, "w") as fo: + for c in cs: + rs = stichBins2Comp(ds[c]) + for r in rs: + fo.write("\t".join(r) + "\n") + + +def readBdg(f): + """ + Read bedGraph file for PC1. + """ + ds = {} #pandas series + for line in open(f): + line = line.split("\n")[0].split("\t") + k = line[0] + ":" + line[1] + "-" + line[2] + v = float(line[-1]) + ds[k] = v + ds = pd.Series(ds) + return ds + + +def getFlips(sa, sb, ps, inds): + """ + Get compartment flips according to PC1 and p-values. 
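+    Bins are classified by the sign of PC1 in each sample (positive as
+    compartment A, negative as compartment B); only bins in the significant
+    index inds are kept, sorted by ascending p-value.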
+ + @param sa: pd.Series, PC1 values for sample A + @param sb: pd.Series, PC1 values for sample B + @param ps: pd.Series, p-values + @param inds: pd.Series.idnex, p-value select index + + @return a2b: pd.Series.idnex, A to B flips + @return b2a: pd.Series.idnex, B to A flips + @return a2a: pd.Series.idnex, A to A changes + @return b2b: pd.Series.index, B to B changes + """ + #AtoB flips + ta = sa[sa > 0].index + tb = sb[sb < 0].index + a2b = ta.intersection(tb).intersection(inds) + a2b = ps[a2b].sort_values(inplace=False, ascending=True).index + + #BtoA flips + ta = sa[sa < 0].index + tb = sb[sb > 0].index + b2a = ta.intersection(tb).intersection(inds) + b2a = ps[b2a].sort_values(inplace=False, ascending=True).index + + #AtoA + ta = sa[sa > 0].index + tb = sb[sb > 0].index + a2a = ta.intersection(tb).intersection(inds) + a2a = ps[a2a].sort_values(inplace=False, ascending=True).index + + #BtoB + ta = sa[sa < 0].index + tb = sb[sb < 0].index + b2b = ta.intersection(tb).intersection(inds) + b2b = ps[b2b].sort_values(inplace=False, ascending=True).index + return a2b, b2a, a2a, b2b + + +def plotChanges(data, na, nb, a2b, b2a, a2a, b2b, pcut, output): + """ + Plot the compartment changes. + """ + + #plot the raw dots + fig, ax = pylab.subplots(figsize=(3.2, 2.2)) + ax.scatter(data[na], + data[nb], + s=0.5, + color="gray", + alpha=0.6, + label="total %s bins" % data.shape[0]) + + #plot the changes + ax.scatter(data[na][a2b], + data[nb][a2b], + s=1, + color=colors[0], + alpha=0.8, + label="A->B %s bins" % len(a2b)) + ax.scatter(data[na][b2a], + data[nb][b2a], + s=1, + color=colors[2], + alpha=0.8, + label="B->A %s bins" % len(b2a)) + ax.scatter(data[na][a2a], + data[nb][a2a], + s=1, + color=colors[3], + alpha=0.8, + label="A->A %s bins" % len(a2a)) + ax.scatter(data[na][b2b], + data[nb][b2b], + s=1, + color=colors[4], + alpha=0.8, + label="B->B %s bins" % len(b2b)) + + leg = ax.legend( + bbox_to_anchor=(1.05, 1.0), + loc='upper left', + labelcolor=["gray", colors[0], colors[2], colors[3], colors[4]]) + for h in leg.legendHandles: + h._sizes = [10] + ax.axvline(0, color="gray", linestyle="--") + ax.axhline(0, color="gray", linestyle="--") + ax.set_xlabel(f"{na} PC1") + ax.set_ylabel(f"{nb} PC1") + ax.set_title(f"Mahalanobis distance P-value < {pcut}") + pylab.savefig(f"{output}_bins_flips.pdf") + + +def parseGtfFeature(t): + ds = {} + t = t.replace('"', '') + for n in t.split('; '): + s = n.split() + ds[s[0]] = s[1] + return ds + + +def readCompGenes(compf, gtf): + """ + Read compartment and gene sets regions as HTSeq.GenomicArrayOfSets. 
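+    Compartment intervals are read from the stitched compartment .bed file;
+    gene intervals are built from GTF exon records, keyed as
+    gene_id|gene_name and spanning the minimal start to maximal end of all
+    exons of a gene.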
+ """ + #read compartments + comps = HTSeq.GenomicArrayOfSets("auto", stranded=False) + for line in open(compf): + line = line.split("\n")[0].split("\t") + iv = HTSeq.GenomicInterval(line[0], int(line[1]), int(line[2])) + comps[iv] += line[3] + #read gtf + genes = HTSeq.GenomicArrayOfSets("auto", stranded=False) + gs = {} + for line in open(gtf): + if line.startswith("#"): + continue + line = line.split("\n")[0].split("\t") + if line[2] != 'exon': + continue + ds = parseGtfFeature(line[8]) + key = "|".join([ds["gene_id"], ds["gene_name"]]) + nline = [ + line[0], line[3], line[4], + "|".join([ds["gene_id"], ds["gene_name"]]), ".", line[6] + ] + if key not in gs: + gs[key] = [line[0], int(line[3]), int(line[4])] + else: + if int(line[3]) < gs[key][1]: + gs[key][1] = int(line[3]) + if int(line[4]) > gs[key][2]: + gs[key][2] = int(line[4]) + for g, v in gs.items(): + iv = HTSeq.GenomicInterval(v[0], v[1], v[2]) + genes[iv] += g + return comps, genes + + +def anoBins(data, na, nb, s, comps, genes, fout): + """ + Annotate changed bins. + """ + ds = {} + ags = set() + for t in s: + chrom = t.split(":")[0] + start = t.split(":")[1].split("-")[0] + end = t.split(":")[1].split("-")[1] + iv = HTSeq.GenomicInterval(chrom, int(start), int(end)) + comp = list(list(comps[iv].steps())[0][1])[0] + gs = set() + for i, g in genes[iv].steps(): + gs.update(g) + gs = list(gs) + if len(gs) > 0: + gs = [g.split("|")[-1] for g in gs] + ags.update(gs) + gs = ",".join(gs) + ds[t] = { + "chrom": chrom, + "start": start, + "end": end, + f"{na} PC1": data.loc[t, na], + f"{nb} PC1": data.loc[t, nb], + "P-value": data.loc[t, "Chi-Square test P-value"], + "compartmentId": comp, + "overlappedGenes": gs, + } + ds = pd.DataFrame(ds).T + ds.to_csv(fout + ".txt", index_label="binId", sep="\t") + ags = list(ags) + with open(fout + "_genes.list", "w") as fo: + fo.write("\n".join(ags)) + + +def compareComp(fa, fb, na, nb, gtf, output, pcut=0.01): + """ + Pair-wise comparsion of compartments. 
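+    @param fa: str, bedGraph file with PC1 values for the first sample
+    @param fb: str, bedGraph file with PC1 values for the second sample
+    @param na: str, name of the first sample
+    @param nb: str, name of the second sample
+    @param gtf: str, GTF annotation file used to annotate changed bins
+    @param output: str, output prefix
+    @param pcut: float, Chi-Square p-value cutoff for the two-passes MD test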
+ """ + print("Step 1: Stiching bins as compartments.") + #step 1 get the compartment + acompf = output + "_" + na + "_compartments.bed" + bcompf = output + "_" + nb + "_compartments.bed" + getCompartment(fa, acompf) + getCompartment(fb, bcompf) + + print("Step 2: Performing two-passes mahalanobis distances caculation.") + #step 2 prepare the bins level data + sa = readBdg(fa) + sb = readBdg(fb) + t = sa.index.intersection(sb.index) + sa = sa[t] + sb = sb[t] + data = pd.DataFrame({na: sa, nb: sb}) + + #step 3 two passes MD test for bins + dis, ps = twoPassesMDTest(data, pcut) + inds = ps[ps < pcut].index + + #step 4 get flips or same compartment chanegs + a2b, b2a, a2a, b2b = getFlips(sa, sb, ps, inds) + + print("Step 3: Plotting switch bins.") + #step 5 show the changes + plotChanges(data, na, nb, a2b, b2a, a2a, b2b, pcut, output) + + print("Step 4: Outputing results.") + #step 6 output the p-values as bdg + ps[ps < 1e-300] = 1e-300 + with open(f"{output}_-logP.bdg", "w") as fout: + for i in ps.index: + chrom = i.split(":")[0] + start = i.split(":")[1].split("-")[0] + end = i.split(":")[1].split("-")[1] + line = [chrom, start, end, str(-np.log10(ps[i]))] + fout.write("\t".join(line) + "\n") + + #step 7 output the distance and p-values + data["Mahalanobis distance"] = dis + data["Chi-Square test P-value"] = ps + data.to_csv(f"{output}_MD_p-values.txt", sep="\t", index_label="binId") + + #step 8 annotate the changed bins and associated genes and output as bed files + ds = { + "AtoB": a2b, + "BtoA": b2a, + "AtoA": a2a, + "BtoB": b2b, + } + comps, genes = readCompGenes(acompf, gtf) + for k, v in ds.items(): + anoBins(data, na, nb, v, comps, genes, f"{output}_{k}_bins") + + +def main(): + op = help() + compareComp(op.fa, op.fb, op.na, op.nb, op.gtf, op.output, op.pcut) + + +if __name__ == "__main__": + main() diff --git a/build/scripts-3.10/compareDom.py b/build/scripts-3.10/compareDom.py new file mode 100755 index 00000000..1ef829b5 --- /dev/null +++ b/build/scripts-3.10/compareDom.py @@ -0,0 +1,514 @@ +#!/home/caoy7/anaconda3/envs/astroBoy/bin/python +#--coding:utf-8 -- +""" + +compareDom.py +cLoops2 compareDom.py quantify and compare Hi-C domains between two conditions. +2023-04-10: modified test without MA conversion +""" + +__date__ = "2023-03-14" +__modified__ = "" +__email__ = "caoyaqiang0410@gmail.com" + +#sys library +import os +import json +import argparse +from glob import glob +from argparse import RawTextHelpFormatter + +#3rd library +import pylab +import HTSeq +import pandas as pd +import numpy as np +from tqdm import tqdm +from joblib import Parallel, delayed + +#cLoops2 +from cLoops2.est import estSfMANorm +from cLoops2.ds import XY +from cLoops2.io import parseIxy +from cLoops2.stat import twoPassesMDTest +from cLoops2.settings import * + + +def help(): + """ + Create the command line interface for the script. + """ + description = """ + Pair-wise quantification and comparsion of domains based on two-passes Mahalanobis distance. + + Example: + compareDom.py -f dom.bed -a young -b old -o youngVsOld -na Young -b Old -pcut 0.01 -gtf mm10.gtf + """ + parser = argparse.ArgumentParser(description=description, + formatter_class=RawTextHelpFormatter) + parser.add_argument("-f", + dest="domf", + required=True, + type=str, + help="Domain file in .bed format.") + parser.add_argument( + "-a", + dest="da", + required=True, + type=str, + help="The data directory generated by cLoops2 pre for the first sample." 
+ ) + parser.add_argument( + "-b", + dest="db", + required=True, + type=str, + help= + "The data directory generated by cLoops2 pre for the second sample.") + parser.add_argument( + "-na", + dest="na", + type=str, + required=True, + help= + "Name of first sample, will be shown in the output figure. Only use alphabet and numbers." + ) + parser.add_argument( + "-nb", + dest="nb", + type=str, + required=True, + help="Name of second sample, will be shown in the output figure.") + parser.add_argument( + "-gtf", + dest="gtf", + default="", + required=True, + type=str, + help= + "GTF file annotation for genes. Significant flip/switch overlapped genes will be reported based on the gene annotation file." + ) + parser.add_argument( + "-pcut", + dest="pcut", + type=float, + default=0.01, + help= + "Chi-Square p-value cutoff for detecting siginficant different compartment, default is 0.01." + ) + + parser.add_argument("-o", + dest="output", + required=True, + type=str, + help="Output prefix.") + parser.add_argument( + "-p", + dest="cpu", + required=False, + type=int, + default=4, + help="How many cpus used for quantification, default is 4.") + parser.add_argument( + "-xlim", + dest="xlim", + required=False, + type=str, + default=None, + help= + "X-axis limitations for the scatter plot, default is autodetermined, set as '1,-1'." + ) + parser.add_argument( + "-ylim", + dest="ylim", + required=False, + type=str, + default=None, + help= + "Y-axis limitations for the scatter plot, default is autodetermined, set as '1,-1'." + ) + parser.add_argument("-vmin", + dest="vmin", + required=False, + type=str, + default=None, + help="Minimal scale for the aggregation heatmap.") + parser.add_argument("-vmax", + dest="vmax", + required=False, + type=str, + default=None, + help="Maxmal scale for the aggregation heatmap.") + + op = parser.parse_args() + return op + + +def parseDom(f): + doms = {} + for line in open(f): + if line[0] == "#": + continue + line = line.split("\n")[0].split("\t") + if len(line) < 3: + continue + if line[0] not in doms: + doms[line[0]] = [] + doms[line[0]].append(line) + return doms + + +def _quant(rs, fixy, tot): + ds = {} + key, mat = parseIxy(fixy) + xy = XY(mat[:, 0], mat[:, 1]) + for r in rs: + key = r[0] + ":" + r[1] + "-" + r[2] + t = xy.queryPeak(int(r[1]), int(r[2])) + b = xy.queryPeakBoth(int(r[1]), int(r[2])) + ds[key] = { + "total": + len(t), + "internalCounts": + len(b), + "externalCounts": + len(t.difference(b)), + "ES(internal/external)": + float(len(b)) / float(len(t.difference(b))), + "internalRPKM": + float(len(b)) / (int(r[2]) - int(r[1])) / tot * 10**9, + "externalRPKM": + float(len(t.difference(b))) / (int(r[2]) - int(r[1])) / tot * + 10**9, + } + return ds + + +def quant(d, doms, cpu): + fs = glob(d + "/*.ixy") + metaf = d + "/petMeta.json" + meta = json.loads(open(metaf).read()) + tot = meta["Unique PETs"] + nfs = {} + for f in fs: + c = f.split("/")[-1].split("-")[0] + nfs[c] = f + #get the data + cs = list(doms.keys()) + ds = Parallel(n_jobs=cpu, backend="multiprocessing")(delayed(_quant)( + doms[c], + nfs[c], + tot, + ) for c in tqdm(cs)) + nds = {} + for d in ds: + for k, v in d.items(): + nds[k] = v + ds = pd.DataFrame(nds).T + return tot, ds + + +def summary(domsa, domsb, na, nb): + data = {} + for c in domsa.columns: + data[na + "_" + c] = domsa[c] + data[nb + "_" + c] = domsb[c] + data = pd.DataFrame(data) + return data + + +def plotChanges(data, na, nb, ra, rb, pcut, output, xlim=None, ylim=None): + """ + Plot the domain changes with MA-plot. 
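+    The scatter shows per-domain ES values of the two samples against each
+    other, with domains passing the MD test highlighted.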
+ """ + sa = data[f"{na}_ES"] + sb = data[f"{nb}_ES"] + + #plot the raw dots + fig, ax = pylab.subplots(figsize=(3.2, 2.2)) + ax.scatter(sa, + sb, + s=0.5, + color="gray", + alpha=0.6, + label="total %s domains" % data.shape[0]) + #plot the changes + ax.scatter(sa[ra], + sb[ra], + s=2, + color=colors[0], + alpha=0.8, + label="%s domains" % len(ra)) + ax.scatter(sa[rb], + sb[rb], + s=2, + color=colors[1], + alpha=0.8, + label="%s domains" % len(rb)) + leg = ax.legend(bbox_to_anchor=(1.05, 1.0), + loc='upper left', + labelcolor=["gray", colors[0], colors[1]]) + for h in leg.legendHandles: + h._sizes = [10] + ax.set_xlabel(f"{na} domain ES") + ax.set_ylabel(f"{nb} domain ES") + ax.set_title(f"ES comparsion\nMahalanobis distance P-value < {pcut}") + s = np.min([np.min(sa), np.min(sb)]) + e = np.max([np.max(sa), np.max(sb)]) + ax.plot([s, e], [s, e], color="gray", linestyle="--") + if xlim is not None: + xlim = list(map(float, xlim.split(","))) + xlim.sort() + ax.set_xlim(xlim) + if ylim is not None: + ylim = list(map(float, ylim.split(","))) + ylim.sort() + ax.set_ylim(ylim) + pylab.savefig(f"{output}_domainChanges.pdf") + + +def writeBed(rs, fo): + with open(fo, "w") as fout: + for t in rs: + chrom = t.split(":")[0] + start = t.split(":")[1].split("-")[0] + end = t.split(":")[1].split("-")[1] + line = [chrom, start, end, t] + fout.write("\t".join(line) + "\n") + + +def plotDiffAggDomains(fa, fb, tota, totb, na, nb, fout, vmin=None, vmax=None): + mata = np.load(fa, mmap_mode="r") + mata = mata["arr_0"] + matb = np.load(fb, mmap_mode="r") + matb = matb["arr_0"] + ta = tota / 10**6 + tb = totb / 10**6 + mata = np.log2((mata + 1) / ta) + matb = np.log2((matb + 1) / tb) + mat = [] + for i in range(mata.shape[0]): + atmat = mata[i] + btmat = matb[i] + nmat = btmat - atmat + mat.append(nmat) + mat = np.array(mat) + mat = np.mean(mat, axis=0) + label = f"normalized log2({nb}/{na})" + #if vmin == None: + # vmin = 0 + fig, ax = pylab.subplots(figsize=(4, 4)) + cmap = sns.color_palette("RdBu_r", 11).as_hex() + cmap[int(len(cmap) / 2)] = "#FFFFFF" + cmap = ListedColormap(cmap) + sns.heatmap(mat, + xticklabels=False, + yticklabels=False, + square=True, + linewidths=0.0, + vmin=vmin, + vmax=vmax, + cmap=cmap, + cbar_kws={ + 'label': label, + "shrink": 0.5 + }) + #draw the box + ax.axvline(x=ax.get_xlim()[0], color="k", linewidth=2) + ax.axvline(x=ax.get_xlim()[1], color="k", linewidth=2) + ax.axhline(y=ax.get_ylim()[0], color="k", linewidth=2) + ax.axhline(y=ax.get_ylim()[1], color="k", linewidth=2) + ax.set_title("log2(%s-%s) %s domains" % (nb, na, mata.shape[0])) + pylab.savefig("%s_diffAggDomains.pdf" % fout) + + +def parseGtfFeature(t): + ds = {} + t = t.replace('"', '') + for n in t.split('; '): + s = n.split() + ds[s[0]] = s[1] + return ds + + +def readGenes(gtf): + """ + Read gene sets regions as HTSeq.GenomicArrayOfSets. 
+ """ + #read gtf + genes = HTSeq.GenomicArrayOfSets("auto", stranded=False) + gs = {} + for line in open(gtf): + if line.startswith("#"): + continue + line = line.split("\n")[0].split("\t") + if line[2] != 'exon': + continue + ds = parseGtfFeature(line[8]) + key = "|".join([ds["gene_id"], ds["gene_name"]]) + nline = [ + line[0], line[3], line[4], + "|".join([ds["gene_id"], ds["gene_name"]]), ".", line[6] + ] + if key not in gs: + gs[key] = [line[0], int(line[3]), int(line[4])] + else: + if int(line[3]) < gs[key][1]: + gs[key][1] = int(line[3]) + if int(line[4]) > gs[key][2]: + gs[key][2] = int(line[4]) + for g, v in gs.items(): + iv = HTSeq.GenomicInterval(v[0], v[1], v[2]) + genes[iv] += g + return genes + + +def anoBins(data, na, nb, s, genes, fout): + """ + Annotate changed bins. + """ + ds = {} + ags = set() + for t in s: + chrom = t.split(":")[0] + start = t.split(":")[1].split("-")[0] + end = t.split(":")[1].split("-")[1] + iv = HTSeq.GenomicInterval(chrom, int(start), int(end)) + gs = set() + for i, g in genes[iv].steps(): + gs.update(g) + gs = list(gs) + if len(gs) > 0: + gs = [g.split("|")[-1] for g in gs] + ags.update(gs) + gs = ",".join(gs) + ds[t] = { + "chrom": chrom, + "start": start, + "end": end, + "overlappedGenes": gs, + } + for c in data.columns: + ds[t][c] = data.loc[t, c] + ds = pd.DataFrame(ds).T + ds.to_csv(fout + ".txt", index_label="domId", sep="\t") + ags = list(ags) + with open(fout + "_genes.list", "w") as fo: + fo.write("\n".join(ags)) + + +def compareDom( + domf, + da, + db, + na, + nb, + gtf, + output, + pcut=0.01, + cpu=4, + xlim=None, + ylim=None, + vmin=None, + vmax=None, +): + """ + Pair-wise comparsion of domains. + + @param domf: str, domains in bed format, file path + @param da: str, cLoops2 pre generated data directory for the first sample + @param db: str, cLoops2 pre generated data directory for the second sample + @param na: str, name of the first sample + @param nb: str, name of the second sample + @param gtf: str, file path of the gtf annotaiton file + @param output: str, output prefix + @param pcut: float, p-value cutoff for the two-passes MD test + @param cpu: int, number of CPUs for running the job. 
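+    @param xlim: str, x-axis limits for the scatter plot, such as "-1,1"
+    @param ylim: str, y-axis limits for the scatter plot
+    @param vmin: minimal scale for the aggregation heatmap
+    @param vmax: maximal scale for the aggregation heatmap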
+ """ + + print("Step 1: Quantifying domains.") + #step 1 quantify the domains + doms = parseDom(domf) + tota, domsa = quant(da, doms, cpu) + totb, domsb = quant(db, doms, cpu) + data = summary(domsa, domsb, na, nb) + data.to_csv(f"{output}_domainQuant.txt", sep="\t", index_label="domId") + #data = pd.read_csv(f"{output}_domainQuant.txt", index_col=0, sep="\t") + + #step 2 pair-wise comparsion + print("Step 2: Performing two-passes mahalanobis distances caculation.") + #use the ES with MA plot to run the two-passes MD test + sa = data[f"{na}_ES(internal/external)"] + sb = data[f"{nb}_ES(internal/external)"] + + a = (sa + sb) / 2 + m = np.log2(sb / sa) + """ + #MANorm2 + sf = estSfMANorm(sa, sb) + sbn = pd.Series( [sf[0]*t + sf[1] for t in sb],index=sb.index ) + a = (sa+sbn )/2 + m = np.log2(sbn/sa) + """ + + #esdata = pd.DataFrame({"A": a, "M": m}) + esdata = pd.DataFrame({f"{na}_ES": sa, f"{nb}_ES": sb}) + dis, ps = twoPassesMDTest(esdata, pcut) + ps[ps < 1e-300] = 1e-300 + inds = ps[ps < pcut].index + esdata["Mahalanobis distance"] = dis + esdata["Chi-Square test P-value"] = ps + esdata.to_csv(f"{output}_ES_MD_p-values.txt", + sep="\t", + index_label="domId") + ra = sa[sa > sb].index.intersection(inds) + ra = m[ra].sort_values(inplace=False, ascending=True).index + rb = sb[sb > sa].index.intersection(inds) + rb = m[rb].sort_values(inplace=False, ascending=False).index + + #data["ES_M"] = esdata["M"] + #data["ES_A"] = esdata["A"] + data["Mahalanobis distance"] = esdata["Mahalanobis distance"] + data["Chi-Square test P-value"] = esdata["Chi-Square test P-value"] + data = pd.read_csv(f"{output}_domainQuant.txt", index_col=0, sep="\t") + + #step 3 show the changes + plotChanges(esdata, na, nb, ra, rb, pcut, output, xlim, ylim) + + #step 4 output the bed files + writeBed(ra, f"{output}_{na}_specific.bed") + writeBed(rb, f"{output}_{nb}_specific.bed") + + print("Step 3: Performing aggregation analysis.") + #step 5 plot the aggregation plots + c1 = f"cLoops2 agg -d {da} -domains {output}_{na}_specific.bed -o {output}_{na}_specific_{na} -p {cpu} -domain_ext 0.25" + c2 = f"cLoops2 agg -d {db} -domains {output}_{na}_specific.bed -o {output}_{na}_specific_{nb} -p {cpu} -domain_ext 0.25" + c3 = f"cLoops2 agg -d {da} -domains {output}_{nb}_specific.bed -o {output}_{nb}_specific_{na} -p {cpu} -domain_ext 0.25" + c4 = f"cLoops2 agg -d {db} -domains {output}_{nb}_specific.bed -o {output}_{nb}_specific_{nb} -p {cpu} -domain_ext 0.25" + for c in [c1, c2, c3, c4]: + print(c) + os.system(c) + plotDiffAggDomains(f"{output}_{na}_specific_{na}_aggDomains.npz", + f"{output}_{na}_specific_{nb}_aggDomains.npz", tota, + totb, na, nb, f"{output}_{na}_specific", vmin, vmax) + plotDiffAggDomains(f"{output}_{nb}_specific_{na}_aggDomains.npz", + f"{output}_{nb}_specific_{nb}_aggDomains.npz", tota, + totb, na, nb, f"{output}_{nb}_specific", vmin, vmax) + + #step 6 annotate the domains + print("Step 4: Annotating specific domains.") + genes = readGenes(gtf) + anoBins(data, na, nb, ra, genes, f"{output}_{na}_specificDomains") + anoBins(data, na, nb, rb, genes, f"{output}_{nb}_specificDomains") + + print("Finished") + + +def main(): + op = help() + compareDom(op.domf, op.da, op.db, op.na, op.nb, op.gtf, op.output, op.pcut, + op.cpu, op.xlim, op.ylim, op.vmin, op.vmax) + + +if __name__ == "__main__": + main() diff --git a/build/scripts-3.10/getBedpeFBed.py b/build/scripts-3.10/getBedpeFBed.py new file mode 100755 index 00000000..65627aef --- /dev/null +++ b/build/scripts-3.10/getBedpeFBed.py @@ -0,0 +1,110 @@ 
+#!/home/caoy7/anaconda3/envs/astroBoy/bin/python +#--coding:utf-8-- +""" +getBedpeFBed.py +Transfering single-end BED file to paired-end BEDPE file as input of cLoops2 . +""" +#systematic library +import os, time, gzip, argparse, sys +from datetime import datetime +from argparse import RawTextHelpFormatter + +#3rd library + +#cLoops2 +from cLoops2.ds import PET +from cLoops2.utils import cFlush + + +def help(): + description = """ + Transfering single-end BED file to paired-end BEDPE file as input of + cLoops2 for furthur analysis. + The 6th column of the BED file of strand information is used to extend + the fragments. + If no strand information available, default treat it as + strand + Example: + getBedpeFBed.py -f a.bed.gz -o a + """ + + parser = argparse.ArgumentParser(description=description, + formatter_class=RawTextHelpFormatter) + parser.add_argument( + '-f', + dest="fin", + required=True, + type=str, + help= + "Input bed files, or .bed.gz files. " + ) + parser.add_argument('-o', + dest="out", + required=True, + type=str, + help="Output file prefix.") + parser.add_argument( + '-ext', + dest="ext", + required=False, + default=150, + type=int, + help= + "The expect fragment length of the bed file to extend from 5' to 3', default is 150." + ) + + op = parser.parse_args() + return op + + +def bed2bedpe(fin, fout, ext=150): + """ + Extend the BED file to BEDPE file according to expected fragment size. + """ + if fin.endswith(".gz"): + fino = gzip.open(fin, "rt") + else: + fino = open(fin) + if fout.endswith(".gz"): + fo = gzip.open(fout, "wt") + else: + fo = open(fout, "w") + for i, line in enumerate(fino): + if i % 10000 == 0: + cFlush("%s read from %s" % (i,fin)) + line = line.split("\n")[0].split('\t') + if len(line) < 6: #no strand information + nline = [ + line[0], line[1], line[2], line[0], + int(line[1]) + ext, + int(line[2]) + ext, ".", "44", "+", "-" + ] + elif line[5] == "+": + nline = [ + line[0], line[1], line[2], line[0], + int(line[1]) + ext, + int(line[2]) + ext, ".", "44", "+", "-" + ] + else: + nline = [ + line[0], + max(0, int(line[1])), + max(0, + int(line[2]) - ext), line[0], line[1], line[2], ".", "44", + "+", "-" + ] + nline = "\t".join(list(map(str, nline))) + "\n" + fo.write(nline) + fino.close() + fo.close() + + +def main(): + op = help() + bed2bedpe(op.fin, op.out+".bedpe.gz", ext=op.ext) + + +if __name__ == "__main__": + start_time = datetime.now() + main() + usedtime = datetime.now() - start_time + sys.stderr.write("Process finished. Used CPU time: %s Bye!\n" % usedtime) diff --git a/build/scripts-3.10/getDI.py b/build/scripts-3.10/getDI.py new file mode 100755 index 00000000..883c3f82 --- /dev/null +++ b/build/scripts-3.10/getDI.py @@ -0,0 +1,143 @@ +#!/home/caoy7/anaconda3/envs/astroBoy/bin/python +#--coding:utf-8 -- +""" +getDI.py +Calculating the Directionality Index according to the paper of Topological Domains in Mammalian Genomes Identified by Analysis of Chromatin Interactions +DI = (B-A) / (|B-A|) * ( (A-E)**2 + (B-E)**2 ) / E +E = (A+B)/2 +""" +__date__ = "2019-09-11" +__modified__ = "" +__email__ = "caoyaqiang0410@gmail.com" + +#general library +import os, argparse +from glob import glob +from collections import Counter +from datetime import datetime +from argparse import RawTextHelpFormatter + +#3rd library +import numpy as np +import pandas as pd +from tqdm import tqdm + +#cLoops2 +from cLoops2.ds import XY +from cLoops2.io import parseIxy +from cLoops2.settings import * + + +def help(): + """ + Create the command line interface for the script. 
+ """ + description = """ + Caculate the Directionality Index for a specific region. + The output .bdg is the bedGraph result for the regions with Directionality Index. + DI is defined as accroding to the formula in following paper: + Topological Domains in Mammalian Genomes Identified by Analysis of Chromatin Interactions + Example: + getDI.py -f GM12878_Trac/chr21-chr21.ixy -o GM12878_Trac_chr21 + """ + parser = argparse.ArgumentParser(description=description, + formatter_class=RawTextHelpFormatter) + parser.add_argument( + "-f", + dest="fixy", + required=True, + type=str, + help= + "Input .ixy file generated by cLoops2 to caculate insulation score.") + parser.add_argument("-o", + dest="output", + required=True, + type=str, + help="Output prefix.") + parser.add_argument( + "-start", + dest="start", + required=False, + default=-1, + type=int, + help= + "Start genomic coordinate for the target region, default is the minmial corrdinate found in the file." + ) + parser.add_argument( + "-end", + dest="end", + required=False, + default=-1, + type=int, + help= + "End genomic coordinate for the target region, default is the maxmial corrdinate found in the file." + ) + parser.add_argument( + "-bs", + dest="binSize", + required=False, + default=5000, + type=int, + help="Bin size (bp) to generate the contact matrix, default is 5000 bp." + ) + parser.add_argument( + "-s", + dest="step", + required=False, + default=100000, + type=int, + help= + "The upstream and downstream extension to caculate insulaiton score, default is 100000 bp." + ) + op = parser.parse_args() + return op + + +def calcDI(f, fout, start=-1, end=-1, bs=10000, step=100000): + """ + Calculation of insulation score, output as .bedGraph file. + """ + print("loading %s" % f) + key, mat = parseIxy(f, cut=0) + xy = XY(mat[:, 0], mat[:, 1]) + if key[0] != key[1]: + print( + "DI can be only caculated for intra-chromosomal interactions. Return." + ) + return + if start == -1: + start = np.min(xy.xs) + step + if end == -1: + end = np.max(xy.ys) - step + bins = int((end - start) / bs) + print("caculating from %s to %s of %s bins" % (start, end, bins)) + ds = [] + ss = [] + with open(fout + ".bdg", "w") as fo: + for i in tqdm(range(bins)): + x = start + i * bs + a = len(xy.queryPeakBoth(x - step, x)) + b = len(xy.queryPeakBoth(x, x + step)) + e = (a + b) / 2 + if e == 0: + continue + if a == b: + di = 0 + else: + di = (b - a) / np.abs(b - a) * ((a - e)**2 + (b - e)**2) / e + line = [key[0], x, x + bs, di] + fo.write("\t".join(list(map(str, line))) + "\n") + + +def main(): + op = help() + calcDI(op.fixy, + op.output, + start=op.start, + end=op.end, + bs=op.binSize, + step=op.step) + + +if __name__ == "__main__": + main() diff --git a/build/scripts-3.10/getFRiF.py b/build/scripts-3.10/getFRiF.py new file mode 100755 index 00000000..bba3c8b6 --- /dev/null +++ b/build/scripts-3.10/getFRiF.py @@ -0,0 +1,207 @@ +#!/home/caoy7/anaconda3/envs/astroBoy/bin/python +#--coding:utf-8 -- +""" +getFRiF.py +Get the fraction of PETs in target features. 
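+1D features (peaks/domains) are given as a .bed file; 2D features
+(loops/domains/stripes) are given as a cLoops2 _loop.txt file.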
+""" +__date__ = "2020-01-09" +__modified__ = "" +__email__ = "caoyaqiang0410@gmail.com" + +#general library +import os, argparse, json +from glob import glob +from datetime import datetime +from argparse import RawTextHelpFormatter + +#3rd library +import numpy as np +import pandas as pd +from tqdm import tqdm +from joblib import Parallel, delayed + +#cLoops +from cLoops2.ds import XY, Loop +from cLoops2.io import parseIxy, parseTxt2Loops +from cLoops2.settings import * + + +def help(): + """ + Create the command line interface for the script. + """ + description = """ + Get the fractrion of reads in target features such as domains + and peaks annotated with .bed file or domains/stripes/loops with + .txt file such as the _loop.txt file. + + getFRiF.py -d GM12878_Trac -b tad.bed + """ + parser = argparse.ArgumentParser(description=description, + formatter_class=RawTextHelpFormatter) + parser.add_argument("-d", + dest="predir", + required=True, + type=str, + help="Directory for cLoops2 pre generated.") + parser.add_argument( + "-b", + dest="bed", + required=False, + default="", + type=str, + help="The .bed annotated the features such peaks/domains.") + parser.add_argument("-l", + dest="floop", + required=False, + type=str, + default="", + help="The _loop.txt .file generated by cLoops2.") + parser.add_argument("-o", + dest="output", + required=True, + type=str, + help="Output prefix.") + parser.add_argument( + "-single", + dest="single", + required=False, + default=False, + action="store_true", + help= + "Whether to treat paired-end as single when check the location in -bed features, default is False, which means the two ends has to be in the same target regions, then it counts one. Set this when -bed features are peaks." + ) + + parser.add_argument( + "-cut", + dest="cut", + type=int, + default=0, + help= + "Distance cutoff for PETs to filter, default is 0. Can be set as the estimated self-ligation distance cutoff." + ) + parser.add_argument('-p', + dest="cpu", + required=False, + default=1, + type=int, + help="Number of CPUs to run the job, default is 1.") + op = parser.parse_args() + return op + + +def get1DFRiF(key, fixy, peaks, cut=0, single=False): + """ + Get reads ratio within 1D peaks/domains. + """ + key2, mat = parseIxy(fixy, cut=cut) + if mat.shape[0] == 0: + print( + "No PETs found in %s maybe due to distance cutoff for PET > %s." % + (fixy, cut)) + xy = XY(mat[:, 0], mat[:, 1]) + if single: + print("Getting the single end tags target %s 1d features from %s." % + (len(peaks), key)) + else: + print("Getting the paired end tags target %s 1d features from %s." % + (len(peaks), key)) + ns = 0 + for peak in tqdm(peaks): + if single: + ns += len(xy.queryPeak(peak[1], peak[2])) + else: + ns += len(xy.queryPeakBoth(peak[1], peaks[2])) + if single: + print("%s: total %s single end tags, %s (%.3f) in target." % + (key, mat.shape[0] * 2, ns, float(ns) / mat.shape[0] / 2)) + else: + print("%s: total %s paired end tags, %s (%.3f) in target." % + (key, mat.shape[0], ns, float(ns) / mat.shape[0])) + return ns + + +def get2DFRiF(key, fixy, loops, cut=0): + """ + Get reads ratio within 2D loops/domains. + """ + key2, mat = parseIxy(fixy, cut=cut) + if mat.shape[0] == 0: + print( + "No PETs found in %s maybe due to distance cutoff for PET > %s." % + (fixy, cut)) + xy = XY(mat[:, 0], mat[:, 1]) + ns = 0 + for loop in tqdm(loops): + ns += len( + xy.queryLoop(loop.x_start, loop.x_end, loop.y_start, + loop.y_end)[-1]) + print("%s: total %s paired end tags, %s (%.3f) in target." 
% + (key, mat.shape[0], ns, float(ns) / mat.shape[0])) + return ns + + +def main(): + op = help() + if op.bed == "" and op.floop == "": + print("No 1D or 2D features assigned as input, return.") + return + metaf = op.predir + "/petMeta.json" + meta = json.loads(open(metaf).read()) + keys = list(meta["data"]["cis"].keys()) + keys.extend(list(meta["data"]["trans"].keys())) + if op.bed != "" and os.path.exists(op.bed): + peaks = {} + for line in open(op.bed): + line = line.split("\n")[0].split("\t") + if len(line) < 3: + continue + key = line[0] + "-" + line[0] + if key not in peaks: + peaks[key] = [] + line[1] = int(line[1]) + line[2] = int(line[2]) + peaks[key].append(line) + ds = Parallel(n_jobs=op.cpu,backend="multiprocessing")(delayed(get1DFRiF)( + key, + meta["data"]["cis"][key]["ixy"], + peaks[key], + cut=op.cut, + single=op.single, + ) for key in keys if key in peaks) + #ds = [d[0] for d in ds] + s = np.sum(ds) + t = meta["Unique PETs"] + if op.single: + t = t * 2 + ds = { + "total": t, + "inTraget": s, + "ratio": float(s) / t, + "single": op.single + } + ds = pd.Series(ds) + ds.to_csv(op.output + "_1d_FRiF.txt", sep="\t") + + if op.floop != "" and os.path.exists(op.floop): + loops = parseTxt2Loops(op.floop) + ds = Parallel(n_jobs=op.cpu,backend="multiprocessing")(delayed(get2DFRiF)( + key, + meta["data"]["cis"][key]["ixy"], + loops[key], + cut=op.cut, + ) for key in keys if key in loops) + #ds = [d[0] for d in ds] + s = np.sum(ds) + t = meta["Unique PETs"] + ds = { + "total": t, + "inTraget": s, + "ratio": float(s) / t, + } + ds = pd.Series(ds) + ds.to_csv(op.output + "_2d_FRiF.txt", sep="\t") + + +if __name__ == "__main__": + main() diff --git a/build/scripts-3.10/getIS.py b/build/scripts-3.10/getIS.py new file mode 100755 index 00000000..6e638d74 --- /dev/null +++ b/build/scripts-3.10/getIS.py @@ -0,0 +1,170 @@ +#!/home/caoy7/anaconda3/envs/astroBoy/bin/python +#--coding:utf-8 -- +""" +getIS.py +cLoops2 calcIS.py caculation the insulation score according to https://www.nature.com/articles/nature20158 +X(x,s) = number of contacts between any pair of elements in the interval (x − s, x + s ) + +#X(x,s) = - log2{(X(x,s) − X(x + s/2,s/2) − X(x − s/2, s/2))/X(x,s)/0.5} +modified as X(x,s) = (X(x,s) − X(x + s/2,s/2) − X(x − s/2, s/2))/(X(x+s/2,s/2)+X(x-s/2,s/2) +directly compare the spanning and independent reads + +""" +__date__ = "2019-09-11" +__modified__ = "" +__email__ = "caoyaqiang0410@gmail.com" + +#general library +import os, argparse +from glob import glob +from collections import Counter +from datetime import datetime +from argparse import RawTextHelpFormatter + +#3rd library +import numpy as np +import pandas as pd +from tqdm import tqdm + +#cLoops2 +from cLoops2.ds import XY +from cLoops2.io import parseIxy +from cLoops2.settings import * + + +def help(): + """ + Create the command line interface for the script. + """ + description = """ + Caculate the insulation score for a specific region. + The output .bdg is the bedGraph result for the regions with insulation score. 
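+    The reported score for each bin is z-score normalized across all bins.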
+ IS is defined as accroding to the formula in following paper: + Capturing pairwise and multi-way chromosomal conformations using chromosomal walks + + Example: + getIS.py -f GM12878_Trac/chr21-chr21.ixy -o GM12878_Trac_chr21 + """ + parser = argparse.ArgumentParser(description=description, + formatter_class=RawTextHelpFormatter) + parser.add_argument( + "-f", + dest="fixy", + required=True, + type=str, + help= + "Input .ixy file generated by cLoops2 to caculate insulation score.") + parser.add_argument("-o", + dest="output", + required=True, + type=str, + help="Output prefix.") + parser.add_argument( + "-start", + dest="start", + required=False, + default=-1, + type=int, + help= + "Start genomic coordinate for the target region, default is the minmial corrdinate found in the file." + ) + parser.add_argument( + "-end", + dest="end", + required=False, + default=-1, + type=int, + help= + "End genomic coordinate for the target region, default is the maxmial corrdinate found in the file." + ) + parser.add_argument( + "-bs", + dest="binSize", + required=False, + default=5000, + type=int, + help="Bin size (bp) to generate the contact matrix, default is 5000 bp." + ) + parser.add_argument( + "-s", + dest="step", + required=False, + default=100000, + type=int, + help= + "The upstream and downstream extension to caculate insulaiton score, default is 100000 bp." + ) + parser.add_argument( + "-cut", + dest="cut", + type=int, + default=0, + help="PETs with distance > cut will be kept, default is 0.") + parser.add_argument( + "-mcut", + dest="mcut", + type=int, + default=-1, + help="PETs with distance < mcut will be kept, default is -1 no limit.") + op = parser.parse_args() + return op + + +def calcIS(f, fout, start=-1, end=-1, bs=10000, step=100000,cut=0,mcut=-1): + """ + Calculation of insulation score, output as .bedGraph file. + """ + print("loading %s" % f) + key, mat = parseIxy(f, cut=cut,mcut=mcut) + xy = XY(mat[:, 0], mat[:, 1]) + if key[0] != key[1]: + print( + "IS can be only caculated for intra-chromosomal interactions. Return." + ) + return + if start == -1: + start = np.min(xy.xs) + step + if end == -1: + end = np.max(xy.ys) - step + bins = int((end - start) / bs) + print("caculating from %s to %s of %s bins" % (start, end, bins)) + ss = [] + ds = [] + for i in tqdm(range(bins)): + x = start + i * bs + xc = len(xy.queryPeakBoth(x - step, x + step)) + if xc == 0: + continue + xcright = len(xy.queryPeakBoth(x, x + step)) + xcleft = len(xy.queryPeakBoth(x - step, x)) + if xcright + xcleft == 0: + continue + xcbridge = xc - xcright - xcleft + s = xcbridge / (xcright + xcleft) + line = [key[0], x, x + bs, s] + ds.append( line ) + ss.append( s ) + ss = np.array(ss) + ss = (ss - np.mean(ss))/np.std(ss) + for i in range(len(ds)): + ds[i][-1] = ss[i] + with open(fout + ".bdg", "w") as fo: + for line in ds: + fo.write("\t".join(list(map(str, line))) + "\n") + + +def main(): + op = help() + calcIS(op.fixy, + op.output, + start=op.start, + end=op.end, + bs=op.binSize, + step=op.step, + cut=op.cut, + mcut=op.mcut, + ) + + +if __name__ == "__main__": + main() diff --git a/build/scripts-3.10/getIntDensity.py b/build/scripts-3.10/getIntDensity.py new file mode 100755 index 00000000..a8d039c1 --- /dev/null +++ b/build/scripts-3.10/getIntDensity.py @@ -0,0 +1,152 @@ +#!/home/caoy7/anaconda3/envs/astroBoy/bin/python +#--coding:utf-8 -- +""" +getIntDensity.py +Get the interaction density for a region. 
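+For each region in the input .bed file, local/distal PET counts and RPKM
+values are reported.
+Example (file names are illustrative):
+    getIntDensity.py -d GM12878_Trac -b regions.bed -o test -p 4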
+""" + +__date__ = "2019-10-08" +__modified__ = "" +__email__ = "caoyaqiang0410@gmail.com" + +#general library +import os +import sys +import json +import argparse +from glob import glob +from datetime import datetime +from argparse import RawTextHelpFormatter + +#3rd library +import numpy as np +import pandas as pd +from tqdm import tqdm +from joblib import Parallel, delayed +from scipy.stats import hypergeom, binom, poisson + +#cLoops2 +from cLoops2.ds import XY +from cLoops2.io import parseTxt2Loops, ixy2pet +from cLoops2.callCisLoops import getPerRegions, estAnchorSig +from cLoops2.settings import * + + +def help(): + """ + Create the command line interface for the script of getAggLoopsPlot.py. + """ + description = """ + Get the interaction density for regions. + """ + parser = argparse.ArgumentParser(description=description, + formatter_class=RawTextHelpFormatter) + parser.add_argument("-d", + dest="predir", + required=True, + type=str, + help="Directory for cLoops2 pre generated.") + parser.add_argument( + "-b", + dest="fbed", + required=True, + type=str, + help= + "The .bed file which contains regions to get the interaction density.") + parser.add_argument("-o", + dest="output", + required=True, + type=str, + help="Output prefix.") + parser.add_argument( + "-pcut", + dest="pcut", + type=int, + default=0, + help= + "Distance cutoff for PETs to filter, default is 0. Can be set as the estimated self-ligation distance cutoff." + ) + parser.add_argument('-p', + dest="cpu", + required=False, + default=1, + type=int, + help="Number of CPUs to run the job, default is 1.") + op = parser.parse_args() + return op + + +def quantifyRegions(key, rs, fixy, pcut=0, pseudo=1): + """ + @param key: str, such as chr21-chr21 + @param loops: list of Loop object + @param fixy: cLoops2 pre generated .ixy file + """ + print("%s\t quantify interaction density of %s regions in %s." 
% + (datetime.now(), len(rs), key)) + xy = ixy2pet(fixy, cut=pcut) + N = xy.number + ds = {} + for r in tqdm(rs): + local = xy.queryPeakBoth(int(r[1]), int(r[2])) + a = xy.queryPeak(int(r[1]), int(r[2])) + distal = a.difference(local) + ds["|".join(r)] = { + "chrom": + r[0], + "start": + r[1], + "end": + r[2], + "name": + r[3], + "allPETs": + len(local) * 2 + len(distal), + "localPETs": + len(local) * 2, + "distalPETs": + len(distal), + "allRPKM": (len(local) * 2 + len(distal)) / + (int(r[2]) - int(r[1])) / N / 2 * 10**9, + "localRPKM": + len(local) * 2 / (int(r[2]) - int(r[1])) / N / 2 * 10**9, + "distalRPKM": + len(distal) * 2 / (int(r[2]) - int(r[1])) / N / 2 * 10**9, + } + return ds + + +def parseBed(f): + regions = {} + for line in open(f): + line = line.split("\n")[0].split("\t") + key = line[0] + "-" + line[0] + if key not in regions: + regions[key] = [] + regions[key].append(line) + return regions + + +def main(): + op = help() + regions = parseBed(op.fbed) + metaf = op.predir + "/petMeta.json" + meta = json.loads(open(metaf).read()) + keys = list(meta["data"]["cis"].keys()) + keys = list(set(keys).intersection(set(regions.keys()))) + ds = Parallel(n_jobs=op.cpu,backend="multiprocessing")(delayed(quantifyRegions)( + key, + regions[key], + meta["data"]["cis"][key]["ixy"], + pcut=op.pcut, + ) for key in keys) + data = {} + for d in ds: + for k, v in d.items(): + data[k] = v + data = pd.DataFrame(data).T + data.to_csv(op.output + "_quant.txt", sep="\t", index_label="rid") + + +if __name__ == "__main__": + main() diff --git a/build/scripts-3.10/getLocalIDS.py b/build/scripts-3.10/getLocalIDS.py new file mode 100755 index 00000000..a7e767df --- /dev/null +++ b/build/scripts-3.10/getLocalIDS.py @@ -0,0 +1,153 @@ +#!/home/caoy7/anaconda3/envs/astroBoy/bin/python +#--coding:utf-8 -- +""" +getLocalIDS.py +Caculate the interaction density score with small window size as 1kb. + + +""" +__date__ = "2019-09-12" +__modified__ = "" +__email__ = "caoyaqiang0410@gmail.com" + +#general library +import os +import argparse +from glob import glob +from collections import Counter +from datetime import datetime +from argparse import RawTextHelpFormatter + +#3rd library +import numpy as np +import pandas as pd +from tqdm import tqdm + +#cLoops2 +from cLoops2.ds import XY +from cLoops2.io import parseIxy +from cLoops2.settings import * + + +def help(): + """ + Create the command line interface for the script of calcLocalIDS.py + """ + description = """ + Caculate the interaction density score for a specific regions. + The output .bdg is the bedGraph result for the regions with insulation score. + Example: + getLocalIDS.py -f GM12878_Trac/chr21-chr21.ixy -o GM12878_Trac_chr21 + """ + parser = argparse.ArgumentParser(description=description, + formatter_class=RawTextHelpFormatter) + parser.add_argument( + "-f", + dest="fixy", + required=True, + type=str, + help= + "Input .ixy file generated by cLoops2 to caculate insulation score.") + parser.add_argument("-o", + dest="output", + required=True, + type=str, + help="Output prefix.") + parser.add_argument( + "-start", + dest="start", + required=False, + default=-1, + type=int, + help= + "Start genomic coordinate for the target region, default is the minmial corrdinate found in the file." + ) + parser.add_argument( + "-end", + dest="end", + required=False, + default=-1, + type=int, + help= + "End genomic coordinate for the target region, default is the maxmial corrdinate found in the file." 
+ ) + parser.add_argument( + "-bs", + dest="binSize", + required=False, + default=1000, + type=int, + help="Bin size (bp) to generate the contact matrix, default is 1000 bp." + ) + parser.add_argument( + "-ext", + dest="ext", + required=False, + default=10, + type=int, + help= + "The extension fold of the target region to show the interactions around. Default is 10." + ) + parser.add_argument( + "-cut", + dest="cut", + required=False, + default=0, + type=int, + help= + "Filtering PETs with distance < cut. Default is 0 without filtering.") + + op = parser.parse_args() + return op + + +def calcIDS(f, fout, start=-1, end=-1, bs=1000, ext=10, cut=0): + """ + Caculate the interaction density score for a region. + """ + print("loading %s" % f) + key, mat = parseIxy(f, cut=cut) + xy = XY(mat[:, 0], mat[:, 1]) + if key[0] != key[1]: + print( + "IS can be only caculated for intra-chromosomal interactions. Return." + ) + return + if start == -1: + start = np.min(xy.xs) + bs * ext / 2 + if end == -1: + end = np.max(xy.ys) - bs * ext / 2 + bins = int((end - start) / bs) + + print("caculating from %s to %s of %s bins" % (start, end, bins)) + with open(fout + "_ids.bdg", "w") as fo: + for i in tqdm(range(bins)): + x = start + i * bs + y = start + (i + 1) * bs + r = 0 + for j in range(int(-ext / 2), int(ext / 2 + 1)): + if j == 0: + continue + s = x + j * bs + e = y + j * bs + ra, rb, rab = xy.queryLoop(x, y, s, e) + #print(x,y,s,e,rab) + r += len(rab) + r = r / xy.number * 10**6 + line = [key[0], x, y, r] + fo.write("\t".join(list(map(str, line))) + "\n") + + +def main(): + op = help() + calcIDS(op.fixy, + op.output, + start=op.start, + end=op.end, + bs=op.binSize, + ext=op.ext, + cut=op.cut) + + +if __name__ == "__main__": + main() diff --git a/build/scripts-3.10/getPETsAno.py b/build/scripts-3.10/getPETsAno.py new file mode 100755 index 00000000..ba5a63b8 --- /dev/null +++ b/build/scripts-3.10/getPETsAno.py @@ -0,0 +1,194 @@ +#!/home/caoy7/anaconda3/envs/astroBoy/bin/python +#--coding:utf-8 -- +""" +getPETsAno.py +Get the annotation of PETs for enhancer/promoter and plot the stats. +""" + +__date__ = "2019-10-11" +__modified__ = "" +__email__ = "caoyaqiang0410@gmail.com" + +#general library +import os +import sys +import json +import argparse +from glob import glob +from datetime import datetime +from argparse import RawTextHelpFormatter + +#3rd library +import numpy as np +import pandas as pd +from tqdm import tqdm +from joblib import Parallel, delayed + +#cLoops2 +from cLoops2.ds import XY +from cLoops2.io import parseIxy +from cLoops2.settings import * + + +def help(): + """ + Create the command line interface for the script. + """ + description = """ + Get the annotations of PETs located for enhancer-promoter, enhancer-enhancer, + promoter-promoter inteaction ratios. 
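+    Each PET end is extended by +/-50 bp and overlapped with the enhancer and
+    promoter annotations; PETs are then counted as Enhancer-Promoter,
+    Enhancer-Enhancer, Promoter-Promoter, Enhancer-Other, Promoter-Other or
+    Other-Other.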
+ Example: + getPETsAno.py -d GM12878_Trac -e enhancer.bed -p promoter.bed -cut 10000 -o GM12878_Trac_PETs_ano + """ + parser = argparse.ArgumentParser(description=description) + #formatter_class=RawTextHelpFormatter) + parser.add_argument("-d", + dest="predir", + required=True, + type=str, + help="Directory for cLoops2 pre generated.") + parser.add_argument("-enhancer", + dest="fe", + required=True, + type=str, + help="The enhancer annotation bed file.") + parser.add_argument("-promoter", + dest="fp", + required=True, + type=str, + help="The enhancer annotation bed file.") + parser.add_argument("-o", + dest="output", + required=True, + type=str, + help="Output prefix.") + parser.add_argument("-p", + dest="cpu", + required=False, + type=int, + default=1, + help="Number of CPU to run the job. Default is 1.") + parser.add_argument( + "-pcut", + dest="pcut", + type=int, + default=0, + help= + "Distance cutoff for PETs to filter, default is 0. Can be set as the estimated self-ligation distance cutoff. " + ) + op = parser.parse_args() + return op + + +def buildFeature(f): + """ + For the features in bed file, map to genomic location easy for access. + For the genomic region, if has feature, it will be shown as True. + """ + print("%s\tBuilding coverage features of %s." % (datetime.now(), f)) + cov = {} + for line in open(f): + line = line.split("\n")[0].split("\t") + if len(line) < 3: + continue + chrom = line[0] + start = int(line[1]) + end = int(line[2]) + if chrom not in cov: + cov[chrom] = set() + #if len( cov[chrom] ) < end: + # cov[chrom].extend( [False] * (end-len(cov[chrom])+1)) + #for i in range( start,end): + # cov[chrom][i] = True + cov[chrom].update(range(start, end)) + return cov + + +def findFeature(cov, chrom, start, end): + """ + Judge if the target region overlap with features. + """ + if chrom not in cov: + return False + for t in range(start, end): + if t in cov[chrom]: + return True + return False + + +def annotatePETs(key, fixy, ecov, pcov, cut=0, ext=50): + """ + Annotate the PETs to enhancer, promoter or Other. + """ + print("%s\tAnnotating PETs for %s with cut > %s." 
% + (datetime.now(), key, cut)) + ep = 0 #enhancer promoter + ee = 0 #enhancer enhancer + pp = 0 #promoter promoter + en = 0 #enhancer none + pn = 0 #promoter none + nn = 0 #none none + key2, mat = parseIxy(fixy, cut=cut) + if key2[0] not in ecov and key2[0] not in pcov and key2[ + 1] not in ecov and key2[1] not in pcov: + return None + for x, y in tqdm(mat): + fae = findFeature(ecov, key2[0], x - ext, x + ext) + fap = findFeature(pcov, key2[0], x - ext, x + ext) + fbe = findFeature(ecov, key2[1], y - ext, y + ext) + fbp = findFeature(pcov, key2[1], y - ext, y + ext) + if fae == True and fbp == True: + ep += 1 + elif fap == True and fbe == True: + ep += 1 + elif fae == True and fbe == True: + ee += 1 + elif fap == True and fbp == True: + pp += 1 + elif fae == True and fbe == False and fbp == False: + en += 1 + elif fbe == True and fae == False and fap == False: + en += 1 + elif fap == True and fbe == False and fbp == False: + pn += 1 + elif fbp == True and fae == False and fap == False: + pn += 1 + else: + nn += 1 + ss = { + "Enhancer-Promoter": ep, + "Enhancer-Enhancer": ee, + "Promoter-Promoter": pp, + "Enhancer-Other": en, + "Promoter-Other": pn, + "Other-Other": nn, + } + return ss + + +def main(): + op = help() + metaf = op.predir + "/petMeta.json" + meta = json.loads(open(metaf).read()) + ecov = buildFeature(op.fe) + pcov = buildFeature(op.fp) + keys = list(meta["data"]["cis"].keys()) + ds = Parallel(n_jobs=1,backend="multiprocessing")(delayed(annotatePETs)( + #ds = Parallel(n_jobs=op.cpu,backend="multiprocessing")(delayed(annotatePETs)( + key, meta["data"]["cis"][key]["ixy"], ecov, pcov, cut=op.pcut) + for key in keys) + ss = {} + for d in ds: + if d is None: + continue + for k, v in d.items(): + if k not in ss: + ss[k] = v + else: + ss[k] += v + ss = pd.Series(ss) + ss.to_csv("%s_PETs_anos.txt" % op.output, sep="\t") + + +if __name__ == "__main__": + main() diff --git a/build/scripts-3.10/getSS.py b/build/scripts-3.10/getSS.py new file mode 100755 index 00000000..e44ba9c9 --- /dev/null +++ b/build/scripts-3.10/getSS.py @@ -0,0 +1,155 @@ +#!/home/caoy7/anaconda3/envs/astroBoy/bin/python +#--coding:utf-8 -- +""" +getSS.py +cLoops2 getSS.py caculation the segregation score for domain-centric analysis. + +""" +__date__ = "2022-10-17" +__modified__ = "" +__email__ = "caoyaqiang0410@gmail.com" + +#general library +import os, argparse +from glob import glob +from collections import Counter +from datetime import datetime +from argparse import RawTextHelpFormatter + +#3rd library +import numpy as np +import pandas as pd +from tqdm import tqdm +from joblib import Parallel, delayed + +#cLoops2 +from cLoops2.ds import XY,Domain +from cLoops2.io import parseIxy, doms2txt, doms2bed +from cLoops2.cmat import getObsMat, xy2dict, dict2mat +from cLoops2.settings import * + + + +def help(): + """ + Create the command line interface for the script. + """ + description = """ + Caculate the segregation score for a specific region. + The output .bdg is the bedGraph result for the regions with segregation score. 
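+    For every bin, a local correlation matrix is built from the surrounding
+    window; correlations between the upstream and downstream halves are
+    averaged (negative values set to zero) and the scores are z-score
+    normalized over the chromosome.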
+ + Example: + getSS.py -f GM12878_Trac/chr21-chr21.ixy -o GM12878_Trac_chr21 + """ + parser = argparse.ArgumentParser(description=description, + formatter_class=RawTextHelpFormatter) + parser.add_argument( + "-f", + dest="fixy", + required=True, + type=str, + help= + "Input .ixy file generated by cLoops2 to caculate segregation score.") + parser.add_argument("-o", + dest="output", + required=True, + type=str, + help="Output prefix.") + parser.add_argument( + "-bs", + dest="binSize", + required=False, + default=1000, + type=int, + help="Bin size (bp) to generate the contact matrix, default is 1000 bp." + ) + parser.add_argument( + "-ws", + dest="winSize", + required=False, + default=50000, + type=int, + help= + "The half of the sliding window size used to caculate local correlation, default is 50000 (50kb)." + ) + parser.add_argument( + "-cut", + dest="cut", + type=int, + default=0, + help="PETs with distance > cut will be kept, default is 0.") + parser.add_argument( + "-mcut", + dest="mcut", + type=int, + default=-1, + help="PETs with distance < mcut will be kept, default is -1 no limit.") + op = parser.parse_args() + return op + + + +def calcSS(f, fout, bs=20000, winSize=500000, cut=0,mcut=-1): + """ + Calculation of segregation score, output as .bedGraph file. + @param bs: bin size + @param winSize: sliding matrix width half size + @param cut: distance cutoff for PETs + """ + key, mat = parseIxy(f, cut=cut,mcut=mcut) + matstart = np.min(mat) + matend = np.max(mat) + start = matstart + winSize + end = matend - winSize + bins = int((end - start) / bs) + #convert to sparse contact matrix + mat = xy2dict(mat, s=matstart, e=matend, r=bs) + mat = dict2mat(mat) + print( + "caculating from %s to %s of %s bins for segregation score with bin size of %s and window size of %s" + % (start, end, bins, bs,winSize)) + rs = [] + ss = [] + for i in tqdm(range(bins)): + x = start + i * bs + s = x - winSize + e = x + winSize + #releative position in contact matrix + s = int( (s - matstart)/bs ) + e = int( (e - matstart)/bs ) +1 + nmat = mat[s:e,s:e].toarray() + #previous + #nmat = getObsMat(mat, s, e, bs) + nmat = np.log2(nmat + 1) + nmat = np.corrcoef(nmat) + nmat = np.nan_to_num(nmat) + nmat = nmat[int(nmat.shape[0] / 2) + 1:, :int(nmat.shape[1] / 2)] + nmat[nmat < 0] = 0 + s = nmat.mean() + ss.append(s) + r = [key[0], x, x + bs] + rs.append(r) + ss = np.array(ss) + ss = (ss - np.mean(ss))/np.std(ss) + for i, r in enumerate(rs): + r.append(ss[i]) + with open(fout +"_SS.bdg", "w") as fo: + for r in rs: + fo.write("\t".join(list(map(str, r))) + "\n") + + + + +def main(): + op = help() + calcSS(op.fixy, + op.output, + bs=op.binSize, + winSize=op.winSize, + cut=op.cut, + mcut=op.mcut, + ) + + +if __name__ == "__main__": + main() diff --git a/build/scripts-3.10/getSigDist.py b/build/scripts-3.10/getSigDist.py new file mode 100755 index 00000000..9334e3ba --- /dev/null +++ b/build/scripts-3.10/getSigDist.py @@ -0,0 +1,223 @@ +#!/home/caoy7/anaconda3/envs/astroBoy/bin/python +#--coding:utf-8 -- +""" +getSigDist.py +check interaction signal distribution. 
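+The observed distribution is the number of PETs per non-empty contact matrix
+bin; the expected distribution is obtained by independently shuffling the two
+PET ends before binning.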
+""" +__date__ = "2020-01-08" +__modified__ = "" +__email__ = "caoyaqiang0410@gmail.com" + +#general library +import os +import argparse +from glob import glob +from datetime import datetime +from argparse import RawTextHelpFormatter + +#3rd library +import numpy as np +import pandas as pd +from tqdm import tqdm +from joblib import Parallel, delayed + +#cLoops +from cLoops2.io import parseIxy +from cLoops2.settings import * + + +def help(): + """ + Create the command line interface. + """ + description = """ + Get the observed/expected interaction signal distribution in contact matrix. + Example: + getSigDist.py -d GM12878_Trac -o GM12878_Trac -cut 0 -p 10 + """ + parser = argparse.ArgumentParser(description=description, + formatter_class=RawTextHelpFormatter) + parser.add_argument("-d", + dest="dir", + required=True, + type=str, + help="Directory for cLoops2 pre generated.") + parser.add_argument("-o", + dest="output", + required=True, + type=str, + help="Output prefix.") + parser.add_argument( + "-c", + dest="chroms", + required=False, + default="", + type=str, + help= + "Whether to process limited chroms, specify it as chr1,chr2,chr3, default is not. Use this to save time for quite big data." + ) + parser.add_argument( + "-bs", + dest="binSize", + required=False, + default=1000, + type=int, + help= + "Bin size (bp) to generate the contact matrix for estimation, default is 1000 bp." + ) + parser.add_argument( + "-cut", + dest="cut", + type=int, + default=0, + help="Distance cutoff for PETs to filter, default is 0.") + parser.add_argument('-p', + dest="cpu", + required=False, + default=1, + type=int, + help="Number of CPUs to run the job, default is 1.") + parser.add_argument( + '-r', + dest="repeats", + required=False, + default=0, + type=int, + help= + "The reapet times to shuffle PETs to get the mean expected background,default is 0, set larger than 1 to get the expected result." + ) + parser.add_argument('-plot', + dest="plot", + required=False, + action="store_true", + help="Set to plot the result.") + parser.add_argument( + '-log', + dest="log", + required=False, + action="store_true", + help= + "Whether log transform the PETs in bins for plotting, set to transform." + ) + op = parser.parse_args() + return op + + +def getObsPETs(mat, binSize=1000): + """ + Get the number of PETs in bins. + @param mat: [[x,y]] + @param binSize:int, contact matrix bin size + """ + minC = np.min(mat) + a = (mat[:, 0] - minC) / binSize + b = (mat[:, 1] - minC) / binSize + a = a.astype(int) + b = b.astype(int) + ss = {} + for i in range(len(a)): + x = a[i] + y = b[i] + if x not in ss: + ss[x] = {} + if y not in ss[x]: + ss[x][y] = 0 + ss[x][y] += 1 + sso = [] + for x in ss.keys(): + for y in ss[x].keys(): + sso.append(ss[x][y]) + return sso + + +def preObs(f, cut=0, binSize=1000): + chrom, mat = parseIxy(f, cut=cut) + return getObsPETs(mat, binSize=binSize) + + +def preExp(f, cut=0, binSize=1000): + chrom, mat = parseIxy(f, cut=cut) + #shuffle data + a = mat[:, 0] + b = mat[:, 1] + np.random.shuffle(a) + np.random.shuffle(b) + mat[:, 0] = a + mat[:, 1] = b + return getObsPETs(mat, binSize=binSize) + + +def plotObsExpSigDist(so, fout, binSize=1000, cut=0, se=None, log=False): + """ + Plot the signal enrichment. 
+ """ + if log: + so = np.log10(so) + if se is not None: + se = np.log10(se) + fig, ax = pylab.subplots() + sns.kdeplot(so, color=colors[0], label="observed", ax=ax) + if se is not None: + sns.kdeplot(se, color=colors[1], label="expected", ax=ax) + if log: + xlabel = "log10(PETs) in bins" + else: + xlabel = "PETs in bins" + ax.set_xlabel(xlabel) + ax.set_ylabel("density") + ax.legend(loc="upper left") + ax.set_title("%s resolution contact matrix with PETs distance > %s" % + (binSize, cut)) + pylab.savefig("%s.pdf" % (fout + "_sigDist")) + + +def main(): + op = help() + if op.chroms == "": + chroms = [] + else: + chroms = set(op.chroms.split(",")) + fs = glob("%s/*.ixy" % op.dir) + nfs = [] + for f in fs: + n = f.split("/")[-1].split(".ixy")[0].split("-") + if len(chroms) == 0: + nfs.append(f) + if len(chroms) > 0 and n[0] in chroms and n[1] in chroms: + nfs.append(f) + fs = nfs + + print("%s \t Getting the observed signal distribution for %s." % + (datetime.now(), op.dir)) + ds = Parallel(n_jobs=op.cpu,backend="multiprocessing")( + delayed(preObs)(f, cut=op.cut, binSize=op.binSize) for f in tqdm(fs)) + ds = np.concatenate(ds) + with open(op.output + "_obs_sigDis.txt", "w") as fo: + fo.write(",".join(list(map(str, ds))) + "\n") + if op.repeats > 0: + print("%s \t Getting the expected signal distribution for %s." % + (datetime.now(), op.dir)) + for i in tqdm(range(op.repeats)): + eds = Parallel(n_jobs=op.cpu,backend="multiprocessing")( + delayed(preExp)(f, cut=op.cut, binSize=op.binSize) for f in fs) + eds = list(np.concatenate(eds)) + if i == 0: + nnp = eds + else: + nnp.extend(eds) + with open(op.output + "_exp_sigDis.txt", "w") as fo: + fo.write(",".join(list(map(str, nnp))) + "\n") + else: + nnp = None + + if op.plot: + plotObsExpSigDist(ds, + op.output, + binSize=op.binSize, + cut=op.cut, + se=nnp, + log=op.log) + + +if __name__ == "__main__": + main() diff --git a/build/scripts-3.10/getSigEnrich.py b/build/scripts-3.10/getSigEnrich.py new file mode 100755 index 00000000..57d2d007 --- /dev/null +++ b/build/scripts-3.10/getSigEnrich.py @@ -0,0 +1,236 @@ +#!/home/caoy7/anaconda3/envs/astroBoy/bin/python +#--coding:utf-8 -- +""" +getSigEnrich.py +Get the enrichment of interaction signals, just like the fingerprint plot for the ChIP-seq. +""" +__date__ = "2019-08-26" +__modified__ = "" +__email__ = "caoyaqiang0410@gmail.com" + +#general library +import os +import sys +import argparse +from glob import glob +from collections import Counter +from datetime import datetime +from argparse import RawTextHelpFormatter + +#3rd library +import numpy as np +import pandas as pd +from tqdm import tqdm +from joblib import Parallel, delayed + +#cLoops2 +from cLoops2.io import parseIxy +from cLoops2.settings import * + + +def help(): + """ + Create the command line interface for the script of estSigEnrich.py. + """ + description = """ + Get the observed and expected enrichment trend plot based on contact matrix. + Example: + getSigEnrich.py -d GM12878_Trac -o GM12878_Trac -cut 0 -p 10 + """ + parser = argparse.ArgumentParser(description=description, + formatter_class=RawTextHelpFormatter) + parser.add_argument("-d", + dest="dir", + required=True, + type=str, + help="Directory for cLoops2 pre generated.") + parser.add_argument("-o", + dest="output", + required=True, + type=str, + help="Output prefix.") + parser.add_argument( + "-c", + dest="chroms", + required=False, + default="", + type=str, + help= + "Whether to process limited chroms, specify it as chr1,chr2,chr3, default is not. 
Use this to save time for quite big data." + ) + parser.add_argument( + "-bs", + dest="binSize", + required=False, + default=5000, + type=int, + help= + "Bin size (bp) to generate the contact matrix for estimation, default is 5000 bp." + ) + parser.add_argument( + "-cut", + dest="cut", + type=int, + default=0, + help="Distance cutoff for PETs to filter, default is 0.") + parser.add_argument('-p', + dest="cpu", + required=False, + default=1, + type=int, + help="Number of CPUs to run the job, default is 1.") + parser.add_argument( + '-r', + dest="repeats", + required=False, + default=10, + type=int, + help= + "The reapet times to shuffle PETs to get the mean expected background,default is 10." + ) + parser.add_argument('-plot', + dest="plot", + required=False, + action="store_true", + help="Set to plot the result.") + op = parser.parse_args() + return op + + +def getObsPETs(mat, binSize=5000): + """ + Get the number of PETs in bins. + @param mat: [[x,y]] + @param binSize:int, contact matrix bin size + """ + minC = np.min(mat) + a = (mat[:, 0] - minC) / binSize + b = (mat[:, 1] - minC) / binSize + a = a.astype(int) + b = b.astype(int) + ss = {} + for i in range(len(a)): + x = a[i] + y = b[i] + if x not in ss: + ss[x] = {} + if y not in ss[x]: + ss[x][y] = 0 + ss[x][y] += 1 + sso = [] + for x in ss.keys(): + for y in ss[x].keys(): + sso.append(ss[x][y]) + return sso + + +def getSortBins(ds, bins=100): + """ + Furthur bin the signal in contact matrix into bins, only care of the cumutative trend. + """ + #default is ascending sort + ds = np.sort(ds) + #bin the contacts into 100 bins for comparing signal enrichment between samples + nn = [] + step = int(len(ds) / bins) + for i in range(0, len(ds), step): + if i + step > len(ds): + break + nn.append(ds[i:i + step].sum()) + nn = np.array(nn) + nn = np.cumsum(nn) / float(nn.sum()) + return nn + + +def preObs(f, cut=0, binSize=5000): + chrom, mat = parseIxy(f, cut=cut) + return getObsPETs(mat, binSize=binSize) + + +def preExp(f, cut=0, binSize=5000): + chrom, mat = parseIxy(f, cut=cut) + #shuffle data + a = mat[:, 0] + b = mat[:, 1] + np.random.shuffle(a) + np.random.shuffle(b) + mat[:, 0] = a + mat[:, 1] = b + return getObsPETs(mat, binSize=binSize) + + +def plotObsExpSigEnrichment(f): + """ + Plot the signal enrichment. + """ + mat = pd.read_csv(f, sep="\t", index_col=0) + fig, ax = pylab.subplots() + ax.plot(mat.index, + mat["observed"] * 100, + color=colors[0], + label="observed") + ax.plot(mat.index, + mat["expected"] * 100, + color=colors[1], + label="expected") + ax.set_xlabel("Percentage of Bins") + ax.set_ylabel("Percetange of PETs") + ax.legend(loc="upper left") + #ax2 = ax.twinx() + #ax2.plot( mat.index, mat["Obs/Exp"],color=colors[2],label="Obs/Exp") + #ax2.set_ylabel("Obs/Exp") + #for t in ax2.get_yticklabels(): + # t.set_color(colors[2]) + pylab.savefig("%s.pdf" % (f.replace(".txt", ""))) + + +def main(): + op = help() + if op.chroms == "": + chroms = [] + else: + chroms = set(op.chroms.split(",")) + fs = glob("%s/*.ixy" % op.dir) + nfs = [] + for f in fs: + n = f.split("/")[-1].split(".ixy")[0].split("-") + if n[0] != n[1]: + continue + if len(chroms) == 0: + nfs.append(f) + if len(chroms) > 0 and n[0] in chroms and n[1] in chroms: + nfs.append(f) + fs = nfs + + print("%s \t Getting the observed enrichment trend for %s." 
% + (datetime.now(), op.dir)) + ds = Parallel(n_jobs=op.cpu,backend="multiprocessing")( + delayed(preObs)(f, cut=op.cut, binSize=op.binSize) for f in tqdm(fs)) + ds = np.concatenate(ds) + nn = getSortBins(ds) + del ds + + print("%s \t Getting the expected enrichment trend for %s." % + (datetime.now(), op.dir)) + for i in tqdm(range(op.repeats)): + ds = Parallel(n_jobs=op.cpu,backend="multiprocessing")( + delayed(preExp)(f, cut=op.cut, binSize=op.binSize) for f in fs) + ds = np.concatenate(ds) + nni = getSortBins(ds) + del ds + if i == 0: + nnp = nni + else: + nnp += nni + nnp = nnp / op.repeats + + ds = pd.DataFrame({"observed": nn, "expected": nni}) + #ds["Obs/Exp"] = ds["observed"]/ds["expected"] + ds.to_csv("%s_sigEnrich.txt" % op.output, sep="\t", index_label="bins") + + if op.plot: + plotObsExpSigEnrichment(op.output + "_sigEnrich.txt") + + +if __name__ == "__main__": + main() diff --git a/build/scripts-3.10/hicpro2bedpe.py b/build/scripts-3.10/hicpro2bedpe.py new file mode 100755 index 00000000..f3014ce8 --- /dev/null +++ b/build/scripts-3.10/hicpro2bedpe.py @@ -0,0 +1,105 @@ +#!/home/caoy7/anaconda3/envs/astroBoy/bin/python +#--coding:utf-8-- +""" +hicpro2bedpe.py +Convert HiC-Pro output validate pairs to cLoops input bedpe file. +""" +#sys +import os +import re +import sys +import gzip +import argparse +from glob import glob +from datetime import datetime +from argparse import RawTextHelpFormatter + +#3rd + +#cLoops2 +from cLoops2.utils import cFlush + + +def help(): + description = """ + Convert HiC-Pro allValidParis to BEDPE file as input of cLoops2. + Example: + hicpro2bedpe.py -f test.allValidPairs -o test + """ + + parser = argparse.ArgumentParser(description=description, + formatter_class=RawTextHelpFormatter) + parser.add_argument( + '-f', + dest="fin", + required=True, + type=str, + help="HiC-Pro allValidPairs file, can be .allValidParis.gz. ") + parser.add_argument('-o', + dest="out", + required=True, + type=str, + help="Output file prefix.") + parser.add_argument('-ext', + dest="ext", + required=False, + type=int, + default=50, + help="Extension from center of read, default is 50. ") + op = parser.parse_args() + return op + + +def pairs2bedpe(f_hicpro, f_out,ext=50): + """ + Converting HiC-Pro output allValidPairs to bedpe file. 
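+    Each mate is taken from its 5' position and extended ext bp in the read
+    orientation to form the two BEDPE intervals.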
+ """ + with gzip.open(f_out, 'wt') as f_bedpe: + if f_hicpro.endswith('.gz'): + #f_pair = gzip.open(f_hicpro) #python2 + f_pair = gzip.open(f_hicpro, 'rt') #python3 + else: + f_pair = open(f_hicpro) + for i, line in enumerate(f_pair): + if i % 100000 == 0: + cFlush("%s PETs processed from %s" % (i, f_hicpro)) + line = line.strip().split('\t') + #if the position is middle of reads + #petA = [line[1], int(line[2])-ext, int(line[2])+ext] + #petB = [line[4], int(line[5])-ext, int(line[5])+ext] + #if the position is 5 end of reads + if line[3] == "+": + petA = [line[1], int(line[2]), int(line[2])+ext] + else: + petA = [line[1], int(line[2])-ext, int(line[2])] + if line[6] == "+": + petB = [line[4], int(line[5]), int(line[5])+ext] + else: + petB = [line[4], int(line[5])-ext, int(line[5])] + + newline = [ + petA[0], petA[1], petA[2], petB[0], petB[1], petB[2], line[0], + '.', line[3], line[6] + ] + f_bedpe.write("\t".join(map(str, newline)) + "\n") + f_pair.close() + + +def main(): + op = help() + if not os.path.isfile(op.fin): + sys.stderr.write("Error: input file %s not exists.\n" % op.fin) + return + bedpe_file = re.sub(r'_allValidPairs(.gz)?$', '', op.out) + bedpe_file = bedpe_file + '.bedpe.gz' + if os.path.isfile(op.out): + sys.stderr.write("Error: output file %s exists.\n" % bedpe_file) + return + pairs2bedpe(op.fin, bedpe_file,ext=op.ext) + + +if __name__ == '__main__': + start_time = datetime.now() + main() + usedtime = datetime.now() - start_time + sys.stderr.write("Process finished. Used CPU time: %s Bye!\n" % usedtime) diff --git a/build/scripts-3.10/juicerLong2bedpe.py b/build/scripts-3.10/juicerLong2bedpe.py new file mode 100755 index 00000000..6c148325 --- /dev/null +++ b/build/scripts-3.10/juicerLong2bedpe.py @@ -0,0 +1,75 @@ +#!/usr/bin/env pyhthon +#--coding:utf-8-- +""" +""" + +import argparse, gzip, os, sys +from datetime import datetime +from argparse import RawTextHelpFormatter + + + +def help(): + description = """ + Convert Juicer long format file to to BEDPE file as input of cLoops2. + Example: + juicerLong2bedpe.py -f test.allValidPairs -o test + """ + parser = argparse.ArgumentParser(description=description, + formatter_class=RawTextHelpFormatter) + parser.add_argument('-f', + dest="fin", + type=str, + help="Input file name, required.") + parser.add_argument('-o', + dest="fout", + required=True, + type=str, + help="Output file name, required.") + op = parser.parse_args() + return op + + + +def long2bedpe(fin, fout, ext=50): + with open(fout, "w") as fo: + for line in open(fin): + if line.startswith("#"): + continue + line = line.split("\n")[0].split() + nline = [ + line[1], + max(0, + int(line[2]) - ext), + int(line[2]) + ext, #pet 1 + line[5], + max(0, + int(line[6]) - ext), + int(line[6]) + ext, #pet 2 + ".", + ".", + "+", + "+" #other infor + ] + if line[0] != "0": + nline[-2] = "-" + if line[4] != "0": + nline[-1] = "-" + fo.write("\t".join(list(map(str, nline))) + "\n") + + + +def main(): + op = help() + if not os.path.isfile(op.fin): + sys.stderr.write("Error: input file %s not exists!\n" % op.fin) + if os.path.isfile(op.fout): + sys.stderr.write("Error: output file %s exists! \n" % op.fout) + long2bedpe(op.fin, op.fout) + + +if __name__ == '__main__': + start_time = datetime.now() + main() + usedtime = datetime.now() - start_time + sys.stderr.write("Process finished. 
Used CPU time: %s Bye!\n" % usedtime) diff --git a/build/scripts-3.10/mergeDomains.py b/build/scripts-3.10/mergeDomains.py new file mode 100755 index 00000000..135fd9b6 --- /dev/null +++ b/build/scripts-3.10/mergeDomains.py @@ -0,0 +1,162 @@ +#!/home/caoy7/anaconda3/envs/astroBoy/bin/python +#--coding:utf-8 -- +""" +Merge domains from multiple results of different resolutions. +""" + +#sys +import os +import sys +import argparse +from datetime import datetime +from argparse import RawTextHelpFormatter + +#3rd library + +#cLoops2 +from cLoops2.ds import Domain +from cLoops2.io import doms2bed + + + +def help(): + """ + Create the command line interface for the script. + """ + description = """ + Merge domains from multiple resolutions. + + Example: + mergeDomains.py -fs 5k_tad.bed,10k_tad.bed,25k_tad.bed -o all -r 0.9 + """ + parser = argparse.ArgumentParser(description=description, + formatter_class=RawTextHelpFormatter) + parser.add_argument( + "-fs", + dest="fins", + required=True, + type=str, + help= + "The domains in BED format from differnet resolutions to be merged.\n"\ + "BED files should be input with decreased resolutions, for example\n"\ + "5k.bed,10k.bed,25k.bed. Multiple files seperated by comma." + ) + parser.add_argument( + '-r', + dest="lrcut", + required=False, + type=float, + default=0.9, + help="Ratio of overlapped domains to be merged. Default is 0.9." + ) + parser.add_argument( + '-o', + dest="fout", + required=True, + type=str, + help="Output file name, required." + ) + op = parser.parse_args() + return op + + + +def compDoms(doma,domb,lrcut=0.9): + """ + Compare if is quite close same domains. + If quite close, whether use doma to replace domb. + """ + if doma.chrom != domb.chrom: + return False + #overlapped domains + if domb.start <= doma.start <= domb.end or domb.start <= doma.end <= domb.end or doma.start <= domb.start <= doma.end or doma.start <= domb.end <= doma.end: + start = max(doma.start,domb.start) + end = min(doma.end,domb.end) + length = max(doma.length,domb.length) + if (end-start)/length > lrcut: + return True + return False + + +def combineDoms(doms, doms2, lrcut=0.9): + """ + Combine domains. 
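+    A domain from doms2 that overlaps an existing domain by more than lrcut of
+    the longer one replaces it; otherwise it is appended as a new domain.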
+ """ + #doms binsize is bigger than doms2 + for key in doms2.keys(): + if key not in doms: + doms[key] = doms2[key] + else: + #add no overlapped + for doma in doms2[key]: + flag = False + for i, domb in enumerate(doms[key]): + flag2 = compDoms(doma,domb,lrcut) + #if highly overlapped and similar, use the higher resolution/smaller domain to replace biggger one + if flag2: + flag = True + doms[key][i] = doma #replace + break + else: + continue + #no overlapped or almost the same + if flag == False: + doms[key].append(doma) + return doms + + +def readDoms(f): + doms = {} + for line in open(f): + line = line.split("\n")[0].split("\t") + if len(line) == 3: + did = "|".join(line) + if len(line) >=4: + if line[4].strip() == "": + did = "|".join(line) + else: + did = line[4] + chrom = line[0] + start = int(line[1]) + end = int(line[2]) + dom = Domain() + dom.id = did + dom.chrom = chrom + dom.start = start + dom.end = end + dom.length = end - start + if chrom not in doms: + doms[chrom] = [] + doms[chrom].append( dom ) + return doms + + + +def main(): + op = help() + + fs = op.fins.split(",") + for f in fs: + if not os.path.isfile(f): + print("%s not exists, return!"%f) + + doms = readDoms( fs[0] ) + for f in fs[1:]: + doms = combineDoms( doms, readDoms(f), lrcut=op.lrcut) + + with open(op.fout+"_mergedDomains.bed","w") as fo: + for c,v in doms.items(): + for dom in v: + line = list( map(str, [dom.chrom, dom.start, dom.end, dom.id]) ) + fo.write( "\t".join(line) + "\n") + with open(op.fout+"_mergedDomains_juicebox.bedpe","w") as fo: + head = "chr1\tx1\tx2\tchr2\ty1\ty2\tname\n" + fo.write(head) + for c,v in doms.items(): + for dom in v: + line = list( map(str, [dom.chrom, dom.start, dom.end,dom.chrom,dom.start,dom.end, dom.id]) ) + fo.write( "\t".join(line) + "\n") + + +if __name__ == "__main__": + main() diff --git a/build/scripts-3.10/plotDiffHeatmap.py b/build/scripts-3.10/plotDiffHeatmap.py new file mode 100755 index 00000000..263ace2b --- /dev/null +++ b/build/scripts-3.10/plotDiffHeatmap.py @@ -0,0 +1,259 @@ +#!/home/caoy7/anaconda3/envs/astroBoy/bin/python +#--coding:utf-8 -- +""" +""" +__date__ = "2021-02-10" +__modified__ = "" +__email__ = "caoyaqiang0410@gmail.com" + +#general library +import os +import sys +import json +import argparse +from glob import glob +from datetime import datetime +from argparse import RawTextHelpFormatter + +#3rd library +import numpy as np +import pandas as pd +from tqdm import tqdm +from scipy import sparse +from joblib import Parallel, delayed +from matplotlib.ticker import AutoLocator + +#cLoops2 +from cLoops2.io import parseIxy +from cLoops2.cmat import getObsMat, getExpMat +from cLoops2.settings import * + + +def help(): + """ + Create the command line interface. + """ + description = """ + Plot the difference of matrix heatmaps for the 3D genome data for two sets. 
+ Example: + plotDiffHeatmap.py -fa Trac1/chr21-chr21.ixy -fb Trac2/chr21-chr21.ixy -o Trac1vs2 -cut 1000 -bs 2000 -log + """ + parser = argparse.ArgumentParser(description=description, + formatter_class=RawTextHelpFormatter) + parser.add_argument( + "-fa", + dest="faixy", + required=True, + type=str, + help="Input .ixy file generated by cLoops2 for first file.") + parser.add_argument( + "-fb", + dest="fbixy", + required=True, + type=str, + help="Input .ixy file generated by cLoops2 for second file.") + parser.add_argument("-o", + dest="output", + required=True, + type=str, + help="Output prefix.") + parser.add_argument( + "-bs", + dest="binSize", + required=False, + default=5000, + type=int, + help= + "Bin size/matrix resolution (bp) to generate the contact matrix for estimation, default is 5000 bp." + ) + parser.add_argument( + "-start", + dest="start", + required=False, + type=int, + default=0, + help="Start genomic coordinate for the target region,default is 0.") + parser.add_argument( + "-end", + dest="end", + required=False, + type=int, + default=-1, + help= + "End genomic coordinate for the target region,default will be inferred from the data." + ) + parser.add_argument("-log", + dest="log", + required=False, + action="store_true", + default=False, + help="Whether to log transform the matrix.") + parser.add_argument( + "-cut", + dest="cut", + type=int, + default=0, + help="PETs with distance > cut will be kept, default is 0.") + parser.add_argument( + "-mcut", + dest="mcut", + type=int, + default=-1, + help="PETs with distance < mcut will be kept, default is -1 no limit.") + parser.add_argument( + "-na", + dest="na", + type=str, + default="", + help="Sample A name, default is inferred from data directory name.") + parser.add_argument( + "-nb", + dest="nb", + type=str, + default="", + help="Sample B name, default is inferred from data directory name.") + parser.add_argument( + "-vmin", + dest="vmin", + type=float, + default=None, + help="The minimum value shown in the heatmap and colorbar.") + parser.add_argument( + "-vmax", + dest="vmax", + type=float, + default=None, + help="The maxmum value shown in the heatmap and colorbar.") + + op = parser.parse_args() + return op + + +def getData(f, cut=0, mcut=-1,start=0, end=-1): + """ + """ + chrom, xy = parseIxy(f, cut=cut, mcut=mcut) + if start == 0: + start = np.min(xy) + if end == -1: + end = np.max(xy) + ps = np.where((xy[:, 0] >= start) & (xy[:, 1] <= end))[0] + xy = xy[ps, ] + n = os.path.split(f)[-2] + p = os.path.abspath(f) + p = os.path.dirname(p) + metaf = os.path.join(p, "petMeta.json") + meta = json.loads(open(metaf).read()) + tot = meta["Unique PETs"] + return n, chrom, xy, tot + + +def plotDiffMatHeatmap( + fa, + fb, + fo, + start=0, + end=-1, + r=5000, + cut=0, + mcut=-1, + na="", + nb="", + log=False, + vmin=None, + vmax=None, +): + """ + Plot the contact matrix heatmaps for compare. + """ + labela, chroma, xya, tota = getData(fa, cut, mcut,start,end) + labelb, chromb, xyb, totb = getData(fb, cut, mcut,start,end) + if chroma != chromb: + print("ERROR! %s and %s are not the same target chromosome, return." 
% + (fa, fb)) + return + if start == 0: + start = min(np.min(xya), np.min(xyb)) + if end == -1: + end = max(np.max(xya), np.max(xyb)) + if na == "": + na = labela + if nb == "": + nb = labelb + mata = getObsMat(xya, start, end, r) + matb = getObsMat(xyb, start, end, r) + sf = tota / totb + mata = mata / sf + if log: + mat = np.log2((mata + 1) / (matb + 1)) + label = "log2( %s/%s )" % (na, nb) + else: + mat = mata - matb + label = "%s-%s" % (na, nb) + + hights = 4 + hr = [6, 0.1] + fig = pylab.figure(figsize=(4, hights)) + gs = mpl.gridspec.GridSpec(len(hr), + 1, + height_ratios=hr, + top=0.95, + bottom=0.05, + left=0.1, + right=0.9, + wspace=0.05) + pylab.suptitle("%s-%s, %s:%s-%s" % (na, nb, chroma[0], start, end), + fontsize=8) + cmap = sns.color_palette("RdBu_r", 11).as_hex() + cmap[int(len(cmap) / 2)] = "#FFFFFF" + cmap = ListedColormap(cmap) + ax = fig.add_subplot(gs[-2]) + cax = fig.add_subplot(gs[-1]) + sns.set(font_scale=0.5) + ax = sns.heatmap(mat, + xticklabels=False, + yticklabels=False, + linewidths=0.0, + square=True, + cmap=cmap, + ax=ax, + center=0, + vmin=vmin, + vmax=vmax, + cbar_ax=cax, + cbar_kws={ + 'label': label, + 'orientation': 'horizontal', + "shrink": 0.5, + "fraction": 0.2, + "anchor": (0.0, 1.0) + }) + cax.tick_params(labelsize=4) + #draw the box + ax.axvline(x=ax.get_xlim()[0], color="k", linewidth=2) + ax.axvline(x=ax.get_xlim()[1], color="k", linewidth=2) + ax.axhline(y=ax.get_ylim()[0], color="k", linewidth=2) + ax.axhline(y=ax.get_ylim()[1], color="k", linewidth=2) + pylab.savefig(fo + "_compareMatrix.pdf") + + +def main(): + op = help() + plotDiffMatHeatmap( + op.faixy, + op.fbixy, + op.output, + start=op.start, + end=op.end, + r=op.binSize, + cut=op.cut, + log=op.log, + na=op.na, + nb=op.nb, + vmin=op.vmin, + vmax=op.vmax, + ) + + +if __name__ == "__main__": + main() diff --git a/build/scripts-3.10/tow.py b/build/scripts-3.10/tow.py new file mode 100755 index 00000000..906d3eda --- /dev/null +++ b/build/scripts-3.10/tow.py @@ -0,0 +1,645 @@ +#!/home/caoy7/anaconda3/envs/astroBoy/bin/python +#--coding:utf-8-- +""" +tow.py +cLoops2 tow algorithm for normalizing ChIP-seq data to reference sample at bp level. +tmp settings: only remove background noise + +Notes +1. log the data, much better, for unfixed size region +2. for bp mean level RPM ,log not needed +3. should also consider different signal to noise ratio, for peak region +4: two level of normalization: a. normalize to ChIP itself background; b. normalize with scaling +5: trin GMM model for signals around peaks can have weights of signal and noise, however, the weights are not stable due to the regions selected around peaks, so it is not good choose (signal - wnoise*noise)/wsignal as correction, can be tested + +Main function: +1. find background regions +2. qc for signal/noise ratio and noise level +3. correct the signal based on noise in one sample +4. correct the signal based on noise ratio for treatment sample to reference sample +5. 
correct the signal regions for treatment sample +""" + +#sys library +import time +import sys +import os +import argparse +from glob import glob +from copy import deepcopy +from datetime import datetime +from argparse import RawTextHelpFormatter + +#3rd library +import pyBigWig +import numpy as np +import pandas as pd +from tqdm import tqdm +from joblib import Parallel, delayed +from sklearn.mixture import GaussianMixture as GMM + +#cLoops2 +from cLoops2.cmat import getBinMean +from cLoops2.utils import getLogger +from cLoops2.settings import * + +#global settings +#logger +date = time.strftime(' %Y-%m-%d', time.localtime(time.time())) +logger = getLogger(fn=os.getcwd() + "/" + date.strip() + "_" + + os.path.basename(__file__) + ".log") + + +def help(): + """ + Create the command line interface for the script. + """ + description = """ + Normalize target sample ChIP-seq data signal to reference sample at bp + level with assumptions of: 1) background noise signal level should be + similar and normalized to 0; 2) there are some regions (shared peaks) + which share similar level of signal intensities. + + Input bigWigs file should be normalized as RPM (reads per million) first. + + The normalization may not suitable for the TF ChIP-seq WT vs KO sample. + + Example: + tow.py -br ref.bed -bt tgt.bed -wr ref.bw -wt tgt.bw -o test + """ + parser = argparse.ArgumentParser(description=description, + formatter_class=RawTextHelpFormatter) + parser.add_argument( + "-br", + dest="refBed", + required=True, + type=str, + help= + "The peaks for reference sample in BED format..\n"\ + ) + parser.add_argument( + "-bt", + dest="tgtBed", + required=True, + type=str, + help= + "The peaks for target sample.\n"\ + ) + parser.add_argument("-wr", + dest="refBw", + required=True, + type=str, + help="The bigWig file for reference sample.") + parser.add_argument("-wt", + dest="tgtBw", + required=True, + type=str, + help="The bigWig file for target sample.") + parser.add_argument("-o", + dest="fnOut", + required=True, + type=str, + help="Output prefix.") + parser.add_argument( + "-ext", + dest="ext", + required=False, + type=int, + default=10000, + help= + "Extension size from peak center to build Gaussian Mixture Model for\n"\ + "classification of background and signal region. Default is 10000bp.\n"\ + "For board peaks such as H3K27me3, should increase the parameter to\n"\ + "include enough nearby regions, such as 50000." + ) + parser.add_argument("-labelr", + dest="refLabel", + required=False, + type=str, + default="ref", + help="The label for reference sample. Default is ref.") + parser.add_argument("-labelt", + dest="tgtLabel", + required=False, + type=str, + default="tgt", + help="The label for target sample. 
Default is tgt.") + op = parser.parse_args() + return op + + +def readBed(f): + rs = [] + for line in open(f): + line = line.split("\n")[0].split("\t") + if len(line) < 3: + continue + try: + line[1] = int(line[1]) + line[2] = int(line[2]) + except: + continue + rs.append(line[:3]) + return rs + + +def buildCov(rs): + """ + Genomic region coverage + """ + lims = {} + cov = {} + for r in rs: + #coverages + if r[0] not in cov: + cov[r[0]] = set() + cov[r[0]].update(range(r[1], r[2] + 1)) + #range limitations + if r[0] not in lims: + lims[r[0]] = [r[1], r[2]] + if r[1] < lims[r[0]][0]: + lims[r[0]][0] = r[1] + if r[2] > lims[r[0]][1]: + lims[r[0]][1] = r[2] + return cov, lims + + +def getRegion(cov, margin=1): + """ + Get regions from coverage + """ + rs = [] + for c, s in cov.items(): + s = list(s) + s.sort() + i = 0 + while i < len(s) - 1: + for j in range(i + 1, len(s)): + if s[j] - s[j - 1] > margin: + break + else: + continue + start = s[i] + end = s[j - 1] + i = j #update search start + rs.append([c, start, end]) + return rs + + +def checkBgOverlaps(c, s, e, cov, lims): + if s < lims[c][0] or e > lims[c][1]: + return True + for i in range(s, e + 1): + if i in cov[c]: + return True + return False + + +def getFgBgs(refPeaks, tgtPeaks, ext=10): + """ + Get the background regions. + #coverage, require no overlaps for background regions + """ + refCov, refLims = buildCov(refPeaks) + tgtCov, tgtLims = buildCov(tgtPeaks) + coCov, allCov, lims = {}, {}, {} + for c in refCov.keys(): + minv = min(refLims[c][0], tgtLims[c][0]) + maxv = max(refLims[c][1], tgtLims[c][1]) + lims[c] = [minv, maxv] + a = deepcopy(refCov[c]) + a.update(deepcopy(tgtCov[c])) + allCov[c] = a + s = refCov[c].intersection(tgtCov[c]) + coCov[c] = s + fgs = getRegion(coCov) #shared peaks regions + alls = getRegion(allCov) #all peak regions + bgs = [] + for r in alls: + d = r[2] - r[1] + c = r[0] + s = r[1] - ext * d + e = r[2] - ext * d + if checkBgOverlaps(c, s, e, allCov, lims): + continue + bgs.append([c, s, e]) + s = r[1] + ext * d + e = r[2] + ext * d + if checkBgOverlaps(c, s, e, allCov, lims): + continue + bgs.append([c, s, e]) + return fgs, bgs + + +def getBwSig(rs, f, bins=100): + """ + Get region signal from bigWig file. + """ + bw = pyBigWig.open(f) + s = [] + for r in rs: + try: + ns = bw.values(r[0], r[1], r[2]) + except: + continue + ns = np.nan_to_num(ns) + if len(ns) < bins: + continue + ns = getBinMean(ns, bins) + s.append(ns) + return np.array(s) + + +def getQc(fgs, + bgs, + refBw, + tgtBw, + fnOut, + refLabel="ref", + tgtLabel="tgt", + bins=100): + """ + Quality control: + 1. noise compare + 2. 
signal to noise ratio compare + """ + fgRef = getBwSig(fgs, refBw, bins=bins) + fgRef = fgRef.mean(axis=0) + fgTgt = getBwSig(fgs, tgtBw, bins=bins) + fgTgt = fgTgt.mean(axis=0) + bgRef = getBwSig(bgs, refBw, bins=bins) + bgRef = bgRef.mean(axis=0) + bgTgt = getBwSig(bgs, tgtBw, bins=bins) + bgTgt = bgTgt.mean(axis=0) + bgfc = bgRef.mean() / bgTgt.mean() + fgfc = fgRef.mean() / fgTgt.mean() + fig, axs = pylab.subplots(1, 3, figsize=(6, 2), sharex=True) + axs = axs.reshape(-1) + x = np.arange(bins) + ax = axs[0] + ax.plot(x, bgRef, label=refLabel) + ax.plot(x, bgTgt, label=tgtLabel) + ax.set_xlabel("bins") + ax.set_ylabel("ChIP-seq mean signals, RPM") + ax.set_title("background region\nsf(%s->%s):%.3f" % + (tgtLabel, refLabel, bgfc)) + ax.legend() + #signal level + ax = axs[1] + ax.plot(x, fgRef, label=refLabel) + ax.plot(x, fgTgt, label=tgtLabel) + ax.set_ylabel("ChIP-seq mean signals, RPM") + ax.set_xlabel("bins") + ax.set_title("peak region\nsf(%s->%s):%.3f" % (tgtLabel, refLabel, fgfc)) + ax.legend() + #signal to noise ratio + refSN = fgRef / bgRef + tgtSN = fgTgt / bgTgt + ax = axs[2] + ax.plot(x, refSN, label=refLabel) + ax.plot(x, tgtSN, label=tgtLabel) + ax.set_xlabel("bins") + ax.set_ylabel("Signal to noise ratio") + ax.legend() + ax.set_title("Signal to noise") + pylab.tight_layout() + pylab.savefig(fnOut + "_1_qc.pdf") + return fgRef, fgTgt, bgRef, bgTgt + + +def estFit(bgs, + refBw, + tgtBw, + bgRef, + bgTgt, + sf, + fnOut, + refLabel="ref", + tgtLabel="tgt", + bins=2): + """ + Estimate linear fiting between samples. + """ + refS = getBwSig(bgs, refBw, bins=bins) + refS = pd.Series(refS.reshape(-1)) + tgtS = getBwSig(bgs, tgtBw, bins=bins) + tgtS = pd.Series(tgtS.reshape(-1)) + + fig, axs = pylab.subplots(1, 2, figsize=(5, 2)) + axs = axs.reshape(-1) + + #signal conversion + refS = refS - bgRef.mean() + tgtS = (tgtS - bgTgt.mean()) * sf + s = refS[refS > 0].index + s = tgtS[s] + s = s[s > 0].index + refS = refS[s] + tgtS = tgtS[s] + #log transformation + refS = np.log2(refS) + tgtS = np.log2(tgtS) + ax = axs[0] + sns.kdeplot(refS, label=refLabel, ax=ax, fill=True) + sns.kdeplot(tgtS, label=tgtLabel, ax=ax, fill=True) + ax.legend() + ax.set_xlabel("log2(RPM)") + ax.set_title("signal distribution") + + #distribution match + ax = axs[1] + #tgtSc = (tgtS - tgtS.mean())/tgtS.std()*refS.std() + refS.mean() + alpha = refS.std() / tgtS.std() + beta = refS.mean() - alpha * tgtS.mean() + tgtSc = tgtS * alpha + beta + + m = refS - tgtSc + a = (refS + tgtSc) / 2 + ax.scatter(a, m, s=0.1) + if beta > 0: + ax.set_title( + "after correction M~A PCC:%.3f\nlog2(%s)=%.3flog2(%s)+%.3f" % + (m.corr(a), refLabel, alpha, tgtLabel, beta)) + else: + ax.set_title( + "after correction M~A PCC:%.3f\nlog2(%s)=%.3flog2(%s)%.3f" % + (m.corr(a), refLabel, alpha, tgtLabel, beta)) + ax.set_xlabel("A, (log2(%s)+log2(%s))/2)" % (refLabel, tgtLabel)) + ax.set_ylabel("M, log2(%s)-log2(%s)" % (refLabel, tgtLabel)) + + pylab.tight_layout() + pylab.savefig(fnOut + "_2_fgSignalConversion.pdf") + return [alpha, beta] + + +def corrSig(ss, noise=None, sf=None, sf2=None, trim=False): + """ + ss: signal matrix + noise: value, random noise + sn: signal noise ratio + sf: scaling fitting, if none, do not scaling + """ + ns = [] + ss = pd.DataFrame(ss) + for t in ss.itertuples(): + t = np.array(t[1:]) + if noise is not None: + t = t - noise + if trim: + t[t < 0] = 0 + if sf is not None: + t = t * sf + if sf2 is not None: + t = 2**(np.log2(t) * sf2[0] + sf2[1]) + ns.append(t) + return pd.DataFrame(ns) + + +def checkCorrSig(rs, + 
bgs, + refBw, + tgtBw, + bgRef, + bgTgt, + sf, + sf2, + fnOut, + refLabel="ref", + tgtLabel="tgt", + bins=100): + """ + Correct signal. + """ + fgRef = corrSig(getBwSig(rs, refBw, bins=bins), bgRef, + trim=True).mean(axis=0) + bgRef = corrSig(getBwSig(bgs, refBw, bins=bins), bgRef, + trim=False).mean(axis=0) + + fgTgt = corrSig(getBwSig(rs, tgtBw, bins=bins), bgTgt, sf, sf2, + trim=True).mean(axis=0) + bgTgt = corrSig(getBwSig(bgs, tgtBw, bins=bins), bgTgt, sf, + trim=False).mean(axis=0) + + fig, axs = pylab.subplots(1, 2, figsize=(6, 3), sharex=True) + axs = axs.reshape(-1) + #noise level + x = np.arange(bins) + ax = axs[0] + ax.plot(x, bgRef, label=refLabel) + ax.plot(x, bgTgt, label=tgtLabel) + ax.set_ylim([-0.001, 0.001]) + ax.set_xlabel("bins") + ax.set_ylabel("ChIP-seq mean signals, RPM") + ax.set_title("background region") + ax.legend() + #signal level + ax = axs[1] + ax.plot(x, fgRef, label=refLabel) + ax.plot(x, fgTgt, label=tgtLabel) + ax.set_ylabel("ChIP-seq mean signals, RPM") + ax.set_xlabel("bins") + ax.set_title("peak region") + ax.legend() + pylab.tight_layout() + pylab.savefig(fnOut + "_3_CorrectSignalQc.pdf") + + +def getGmm(fg, bg, bw, ax, title, ext=5000, bins=5): + """ + Train one GMM. Data should log2 first + """ + rs = [] + for r in fg: + center = int((r[1] + r[2]) / 2) + nr = [r[0], center - ext, center + ext] + rs.append(nr) + #foreground signal + fgS = getBwSig(fg, bw, bins=bins) + fgS = fgS.reshape(-1) + fgS = np.log2(fgS[fgS > 0]) + #background signal + bgS = getBwSig(bg, bw, bins=bins) + bgS = bgS.reshape(-1) + bgS = np.log2(bgS[bgS > 0]) + #peak nearby region + mixS = getBwSig(rs, bw, bins=bins) + mixS = mixS.reshape(-1) + mixS = np.log2(mixS[mixS > 0]) + #plot + sns.kdeplot(bgS, label="background", fill=False, ax=ax) + sns.kdeplot(fgS, label="peaks", fill=False, ax=ax) + sns.kdeplot(mixS, label="peak and nearby", fill=False, ax=ax) + #train gmm + gmm = GMM(n_components=2, + covariance_type="full", + random_state=123, + means_init=[[np.mean(fgS)], + [np.mean(bgS)]]).fit([[v] for v in mixS]) + ms = gmm.means_.reshape(-1) + ws = gmm.weights_ + ax.legend() + ax.set_xlabel("log2(RPM)") + ax.set_title(title + "\n" + "means:%.3f, %.3f\nweights:%.3f, %.3f" % + (ms[0], ms[1], ws[0], ws[1])) + #correct gmm predict targets, 0 as noise , 1 as signal + if ms[0] > ms[1]: + cs = {0: 1, 1: 0} + else: + cs = {0: 0, 1: 1} + return gmm, cs + + +def trainGmm(refPeaks, + tgtPeaks, + bgs, + refBw, + tgtBw, + fnOut, + refLabel="ref", + tgtLabel="tgt", + bins=5): + """ + Train GMM to classify background regions or signal regions + """ + fig, axs = pylab.subplots(1, 2, figsize=(6, 3), sharex=True, sharey=True) + axs = axs.reshape(-1) + + refGmm, refCs = getGmm(refPeaks, bgs, refBw, axs[0], refLabel) + tgtGmm, tgtCs = getGmm(tgtPeaks, bgs, tgtBw, axs[1], tgtLabel) + + pylab.tight_layout() + pylab.savefig(fnOut + "_4_GMM.pdf") + return refGmm, refCs, tgtGmm, tgtCs + + +def normRefBw(bw, gmm, gmmCs, noise, fnOut): + """ + Normalize reference bigWig files. Only remove background noise. 
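+    Each bigWig interval is classified with the GMM on log2 signal; intervals
+    assigned to the background class are dropped and the mean background noise
+    is subtracted from the remaining values before writing the bedGraph.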
+ """ + bwi = pyBigWig.open(bw) + with open(fnOut + ".bdg", "w") as fo: + for chrom, size in bwi.chroms().items(): + #get the singal for whole chromosome + ss = bwi.intervals(chrom) + for s in ss: + v = s[-1] + if v == 0: + continue + #gmm was trained with log2 data + t = gmmCs[gmm.predict([[np.log2(v)]])[0]] + if t == 0: + continue + v = v - noise + if v < 0: + continue + line = [chrom, s[0], s[1], "%.5f" % v] + fo.write("\t".join(list(map(str, line))) + "\n") + bwi.close() + + +def normTgtBw(bw, gmm, gmmCs, noise, sf, sf2, fnOut): + """ + Normalize target sample bigWig file. + """ + bwi = pyBigWig.open(bw) + with open(fnOut + ".bdg", "w") as fo: + for chrom, size in bwi.chroms().items(): + #add header + ss = bwi.intervals(chrom) + for s in ss: + v = s[-1] + if v == 0: + continue + #gmm was trained with log2 data + t = gmmCs[gmm.predict([[np.log2(v)]])[0]] + if t == 0: + continue + v = v - noise + if v < 0: + continue + #first scaling factor, obtained with not log2 data + v = v * sf + if t == 1: #signal, further normalization + v = 2**(np.log2(v) * sf2[0] + sf2[1]) + line = [chrom, s[0], s[1], "%.5f" % v] + fo.write("\t".join(list(map(str, line))) + "\n") + bwi.close() + + +def main(): + op = help() + + #step 0 parameters check + report = "python tow.py -br {refBed} -bt {tgtBed} -wr {refBw} -wt {tgtBw} -o {fnOut} -ext {ext} -labelr {refLabel} -labelt {tgtLabel}".format( + refBed=op.refBed, + tgtBed=op.tgtBed, + refBw=op.refBw, + tgtBw=op.tgtBw, + fnOut=op.fnOut, + ext=op.ext, + refLabel=op.refLabel, + tgtLabel=op.tgtLabel) + logger.info(report) + + #step 1 read bed + refPeaks = readBed(op.refBed) + tgtPeaks = readBed(op.tgtBed) + + #step 2 generate background regions + fgs, bgs = getFgBgs(refPeaks, tgtPeaks) + logger.info( + "[%s] Step0: ref sample peaks: %s; tgt sample peaks: %s; shared: %s; background regions: %s" + % (op.fnOut, len(refPeaks), len(tgtPeaks), len(fgs), len(bgs))) + + #step 3 qc for signal to noise ratio and noise level + logger.info( + "[%s] Step1: Initial QC for background noise level and signal-to-noise ratio." + % (op.fnOut)) + fgRef, fgTgt, bgRef, bgTgt = getQc(fgs, + bgs, + op.refBw, + op.tgtBw, + op.fnOut, + refLabel=op.refLabel, + tgtLabel=op.tgtLabel) + #scaling factor for background region + sf = bgRef.mean() / bgTgt.mean() + + #step 4 estimate sample-wise fitting + #scaling factor for singla region + logger.info("[%s] Step2: Estimating scaling factor for signal regions." % + (op.fnOut)) + sf2 = estFit(fgs, op.refBw, op.tgtBw, bgRef, bgTgt, sf, op.fnOut, + op.refLabel, op.tgtLabel) + + #step 5 check correction signal + logger.info("[%s] Step3: Checking corrected signal." % (op.fnOut)) + checkCorrSig(fgs, bgs, op.refBw, op.tgtBw, bgRef, bgTgt, sf, sf2, op.fnOut, + op.refLabel, op.tgtLabel) + + #step 6 train GMM with fg and bg data for classifiy fg and bg regions + logger.info( + "[%s] Step4: Building Gaussian Mixture Model for classfication of background and signal regions" + % (op.fnOut)) + refGmm, refCs, tgtGmm, tgtCs = trainGmm(refPeaks,tgtPeaks, bgs, op.refBw, op.tgtBw, + op.fnOut, op.refLabel, op.tgtLabel) + + #step 7 performe normalization to bigWig files + noiseRef = bgRef.mean() + noiseTgt = bgTgt.mean() + logger.info( + "[%s] Step5: Normaling reference sample with background noise." % + (op.fnOut)) + normRefBw(op.refBw, refGmm, refCs, noiseRef, op.fnOut + "_" + op.refLabel) + logger.info("[%s] Step6: Normaling target sample." 
% (op.fnOut)) + normTgtBw(op.tgtBw, tgtGmm, tgtCs, noiseTgt, sf, sf2, + op.fnOut + "_" + op.tgtLabel) + + logger.info("[%s] analysis finished." % (op.fnOut)) + + +if __name__ == "__main__": + start_time = datetime.now() + main() + usedtime = datetime.now() - start_time + sys.stderr.write("Process finished. Used CPU time: %s Bye!\n" % usedtime) diff --git a/build/scripts-3.10/tracPre.py b/build/scripts-3.10/tracPre.py new file mode 100755 index 00000000..54b40bb7 --- /dev/null +++ b/build/scripts-3.10/tracPre.py @@ -0,0 +1,314 @@ +#!/home/caoy7/anaconda3/envs/astroBoy/bin/python +#--coding:utf-8-- +""" +tracPre.py +Pre-processing code for Trac-looping data, implemented with cLoops2, from fastq to bedpe files and qc report. +2020-02-27: finished and well tested. +""" + +__author__ = "CAO Yaqiang" +__email__ = "caoyaqiang0410@gmail.com" + +#systematic library +import os +import time +import gzip +import argparse +import subprocess +from glob import glob +from datetime import datetime +from argparse import RawTextHelpFormatter + +#3rd library +import pandas as pd +from joblib import Parallel, delayed + +#cLoops2 +from cLoops2.utils import getLogger, callSys, isTool + +#global settings +#logger +date = time.strftime(' %Y-%m-%d', time.localtime(time.time())) +logger = getLogger(fn=os.getcwd() + "/" + date.strip() + "_" + + os.path.basename(__file__) + ".log") + + +def help(): + """ + Create the command line interface for the script. + """ + description = """ + Preprocess the raw reads of FASTQ files of Trac-looping to reference + geneome with bowtie2 and obtain the unqiue PETs with quality control + results. + Fastqs files should be named with suffix pattern as + _R1.fastq.gz, _R2.fastq.gz. + + Example: + tracPre.py -fqd ../1.fq -o ./ -ref ../bowtie2/hg38 -n 10 -p 5 -mapq 10 + """ + parser = argparse.ArgumentParser(description=description, + formatter_class=RawTextHelpFormatter) + parser.add_argument( + "-fqd", + dest="fqd", + required=True, + type=str, + help="The directory for raw .fastq.gz files, for example ../1.fastq/ " + ) + parser.add_argument( + "-o", + dest="output", + required=False, + type=str, + default="./", + help= + "Output directory, default is ./, if directory not exists, create one." + ) + parser.add_argument( + "-ref", + dest="ref", + required=True, + type=str, + help= + "Bowtie2 reference index prefix, such as ./ref/hg38, generated from\n"\ + "bowtie2-build hg38.fa hg38." + ) + parser.add_argument( + "-n", + dest="number", + required=False, + type=int, + default=1, + help="How many Bowtie2 to run at the same time, default is 1. ") + parser.add_argument( + "-p", + dest="cpu", + required=False, + type=int, + default=5, + help="How many cpus used by each Bowtie2 or following processing,\n"\ + "default is 5. " + ) + parser.add_argument( + "-mapq", + dest="mapq", + required=False, + default=10, + type=int, + help="MAPQ cutoffs for filtering PETs, default is 10." + ) + op = parser.parse_args() + return op + + +def preFqs(fastqRoot): + """ + If the fastq files are well prepared, suitable. 
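+    Pair the *_R1.fastq.gz and *_R2.fastq.gz files under fastqRoot by sample
+    name and return a dict of {sample: [R1, R2]}.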
+ """ + fastqs = glob(fastqRoot + "/*.fastq.gz") + data = {} + for fq in fastqs: + s = os.path.split(fq)[1] + s = s.replace(".fastq.gz", "") + if s.endswith("_R1"): + sample = s.replace("_R1", "") + if sample not in data: + data[sample] = [0, 0] + data[sample][0] = fq + if s.endswith("_R2"): + sample = s.replace("_R2", "") + if sample not in data: + data[sample] = [0, 0] + data[sample][1] = fq + for key, fqs in data.items(): + if len(fqs) != 2: + logger.error( + "for %s there is not paired fastq files, only %s found" % + (key, ",".join(fqs))) + del data[key] + return data + + +def tracMapping(sample, fqs, ref, outdir, cpus=25): + """ + Mapping settings for Trac-looping data. + """ + logger.info("Start mapping %s.\n" % sample) + od = os.path.join(outdir, sample) + if not os.path.exists(od): + os.makedirs(od, exist_ok=True) + sam = od + "/" + sample + ".sam" + bam = od + "/" + sample + ".bam" + if os.path.isfile(sam): + logger.error("%s:%s exists, return." % (sample, sam)) + return None + if os.path.isfile(bam): + logger.error("%s:%s exists, return." % (sample, bam)) + return None + doBowtie = "bowtie2 -p {cpus} -q --local --very-sensitive -x {ref} -1 {fq1} -2 {fq2} -S {sam}".format( + cpus=cpus, ref=ref, fq1=fqs[0], fq2=fqs[1], sam=sam) + logger.info(doBowtie) + stat, output = subprocess.getstatusoutput(doBowtie) + #trim with "Warning" + output = output.split("\n") + output = [t for t in output if not t.startswith("Warning")] + output = "\n".join(output) + logger.info("FLAG_A:" + sample + "\n" + output + "\nFLAG_A\n") + return sam + + +def getUniqueBedpe(f, fout): + """ + Get unique bedpe + """ + if os.path.isfile(fout): + return + print("Getting unique PETs from %s to %s" % (f, fout)) + redus = set() + #with gzip.open(fout, "wt") as fo: + with open(fout, "w") as fo: + #for i, line in enumerate(gzip.open(f, "rt")): + for i, line in enumerate(open(f)): + line = line.split("\n")[0].split("\t") + if len(line) < 6: + continue + #remove redudant PETs + r = hash(tuple(line[:6])) + if r in redus: + continue + else: + redus.add(r) + #shroten the name + #line[6] = str(i) + fo.write("\t".join(line) + "\n") + + +def sam2bamBedpe(sam, mapq=10): + """ + SAM to BAM and bedpe file + """ + n = os.path.splitext(sam)[0] + bam = n + ".bam" + bedpeAll = n + "_all.bedpe" + bedpeUni = n + "_unique.bedpe" + #sam to bam, filtering mapq + samview = "samtools view -b -F 4 -@ 2 -q {mapq} -o {bam} {sam}".format( + mapq=mapq, bam=bam, sam=sam) + #sort by read name + samsort = "samtools sort -n -@ 2 {bam} -T {pre} -o {bam}".format( + bam=bam, pre=bam.replace(".bam", "")) + rmsam = "rm %s" % (sam) + cmds = [samview, samsort, rmsam] + callSys(cmds, logger) + bam2bedpe = "bamToBed -bedpe -i {bam} > {bedpe}".format(bam=bam, + bedpe=bedpeAll) + logger.info(bam2bedpe) + stat, output = subprocess.getstatusoutput(bam2bedpe) + getUniqueBedpe(bedpeAll, bedpeUni) + cmd = "gzip %s %s" % (bedpeAll, bedpeUni) + callSys([cmd], logger) + return bedpeAll + ".gz" + + +def sParseBowtie(lines): + """ + Parse Bowtie2 log file, to obtain mapping stastics. 
+ """ + d, s = None, None + lines = lines.split("\n") + s = lines[0] + totalReads = int(lines[1].split(";")[0].split()[0]) + d1 = lines[4].strip().split() + conUniqueMappedReads = int(d1[0]) + d2 = lines[8].strip().split() + unconUniqueMappedReads = int(d2[0]) + #mapRatio = float(lines[15].split("%")[0]) + mapRatio = float(lines[-2].split("%")[0]) + d = { + "TotalRawReads": totalReads, + #"ConcordantlyUniqueMapReads": conUniqueMappedReads, + #"DisconcordantlyUniqueMapReads": unconUniqueMappedReads, + "MappingRatio(%s)": mapRatio + #"MultipleMapReads": multipleMappedReads, + #"MultipleMapRatio": multipleMappedRatio, + } + return d, s + + +def parseBowtielog(logs=None): + if logs == None: + logs = glob("*.log") + data = {} + for log in logs: + lines = open(log).read().split("FLAG_A\n") + lines = [line for line in lines if "FLAG_A" in line] + for line in lines: + t = line.split("FLAG_A:")[1] + d, s = sParseBowtie(t) + data[s] = d + data = pd.DataFrame(data).T + return data + + +def main(): + """ + Batch converting from bam to bedpe. + """ + op = help() + for t in ["bowtie2", "samtools", "bamToBed"]: + if not isTool(t): + logger.error("%s not exits! Please install through conda." % t) + return + if not os.path.exists(op.fqd): + logger.error("Input %s not exists! Return." % op.fqd) + return + if len(glob(op.ref + "*.bt2")) == 0: + logger.error("Bowtie2 reference not exists for prefix of %s! Return." % + op.ref) + return + if not os.path.exists(op.output): + os.makedirs(op.output, exist_ok=True) + else: + fs = glob(os.path.join(op.output, "*")) + if len(fs) > 0: + logger.info( + "Target output directory %s is not empty, may over-write some files." + % op.output) + + #mapping + data = preFqs(op.fqd) + if len(data) == 0: + logger.error( + "No matched _R1.fastq.gz and _R2.fastq.gz in %s. Return." % + (op.fqd)) + return + ref = op.ref + sams = Parallel(n_jobs=op.number,backend="multiprocessing")( + delayed(tracMapping)(sample, fqs, ref, op.output, cpus=op.cpu) + for sample, fqs in data.items()) + sams = [sam for sam in sams if sam is not None] + + #sam to bam and bedpe + cpus = op.number * op.cpu + ncpus = int(min(len(sams), cpus / 2)) + bedpes = Parallel(n_jobs=ncpus,backend="multiprocessing")(delayed(sam2bamBedpe)(sam) for sam in sams) + + #cLoops2 qc + cmd = "cLoops2 qc -f %s -o bedpeQc -p %s" % (",".join(bedpes), + min(len(bedpes), cpus)) + callSys([cmd], logger) + + #combine report + mata = parseBowtielog() + matb = pd.read_csv("bedpeQc_bedpeQc.txt", index_col=0, sep="\t") + matb.index = [i.split("_all")[0] for i in matb.index] + for c in matb.columns: + mata[c] = matb[c] + mata.to_csv("tracPre_summary.txt", sep="\t") + cmd = "rm bedpeQc_bedpeQc.txt" + os.system(cmd) + + +if __name__ == '__main__': + main() diff --git a/build/scripts-3.10/tracPre2.py b/build/scripts-3.10/tracPre2.py new file mode 100755 index 00000000..49346b06 --- /dev/null +++ b/build/scripts-3.10/tracPre2.py @@ -0,0 +1,515 @@ +#!/home/caoy7/anaconda3/envs/astroBoy/bin/python +#--coding:utf-8-- +""" +tracPre.py +Pre-processing code for Hi-Trac data, implemented with cLoops2, from fastq to bedpe files and qc report. +2020-02-27: finished and well tested. 
+2020-06-30: add linker filter, new stat, and changing mapping to end-to-end +2023-09-11: summary report output prefix added +""" + +__author__ = "CAO Yaqiang" +__email__ = "caoyaqiang0410@gmail.com" + +#systematic library +import os +import time +import gzip +import argparse +import subprocess +from glob import glob +from datetime import datetime +from argparse import RawTextHelpFormatter + +#3rd library +import pandas as pd +from joblib import Parallel, delayed +from Bio.Seq import Seq +from Bio.SeqIO.QualityIO import FastqGeneralIterator + +#cLoops2 +from cLoops2.utils import getLogger, callSys, isTool + +#global settings +#logger +logger = None + + +def help(): + """ + Create the command line interface for the script. + """ + description = """ + Preprocess the raw reads of FASTQ files of Hi-TrAC to reference + genome with bowtie2 and obtain the unqiue PETs with quality control + results. + Fastqs files should be named with suffix pattern as + _R1.fastq.gz, _R2.fastq.gz. + + Example: + tracPre2.py -fqd ../1.fq -o ./ -ref ../bowtie2/hg38 -n 10 -p 5 -mapq 10 + """ + parser = argparse.ArgumentParser(description=description, + formatter_class=RawTextHelpFormatter) + parser.add_argument( + "-fqd", + dest="fqd", + required=True, + type=str, + help="The directory for raw .fastq.gz files, for example ../1.fastq/ ") + parser.add_argument( + "-o", + dest="output", + required=False, + type=str, + default="./", + help= + "Output directory, default is ./, if directory not exists, create one." + ) + parser.add_argument( + "-ref", + dest="ref", + required=True, + type=str, + help= + "Bowtie2 reference index prefix, such as ./ref/hg38, generated from\n"\ + "bowtie2-build hg38.fa hg38." + ) + parser.add_argument( + "-n", + dest="number", + required=False, + type=int, + default=1, + help="How many Bowtie2 to run at the same time, default is 1. ") + parser.add_argument( + "-p", + dest="cpu", + required=False, + type=int, + default=5, + help="How many cpus used by each Bowtie2 or following processing,\n"\ + "default is 5. " + ) + parser.add_argument("-mapq", + dest="mapq", + required=False, + default=10, + type=int, + help="MAPQ cutoffs for filtering PETs, default is 10.") + op = parser.parse_args() + return op + + +def preFqs(fastqRoot): + """ + If the fastq files are well prepared, suitable. + """ + fastqs = glob(fastqRoot + "/*.fastq.gz") + data = {} + for fq in fastqs: + s = os.path.split(fq)[1] + s = s.replace(".fastq.gz", "") + if s.endswith("_R1"): + sample = s.replace("_R1", "") + if sample not in data: + data[sample] = [0, 0] + data[sample][0] = fq + if s.endswith("_R2"): + sample = s.replace("_R2", "") + if sample not in data: + data[sample] = [0, 0] + data[sample][1] = fq + for key, fqs in data.items(): + if len(fqs) != 2: + logger.error( + "for %s there is not paired fastq files, only %s found" % + (key, ",".join(fqs))) + del data[key] + return data + + +def getRCSeq(seq): + """ + Get the reverse complementary sequence + """ + tab = str.maketrans("ACTG","TGAC") + return seq.translate( tab )[::-1] + + +def findLinker(seq, linker): + """ + Match the linker in the read sequence. 
+ """ + linkerrc = getRCSeq(linker) + pos = -1 + for i in range(len(seq) - 9): + seed = seq[i:i + 9] + if linker.startswith(seed): + pos = i + break + if linkerrc.startswith(seed): + pos = i + break + return pos + + +def checkStarts(seq): + """ + Check the starts + """ + flag = False + ss = ["CATG", "AATT", "NATG", "NATT"] + for s in ss: + if seq.startswith(s): + flag = True + break + return flag + + +def cutLinker(fq1, fq2, pre, rlen=10, linker="CTGTCTCTTATACACATCT"): + """ + Cut linkers and filter too short reads + """ + sample = pre.split("/")[-1] + nf1 = pre + "_R1.fastq.gz" + nf2 = pre + "_R2.fastq.gz" + if os.path.isfile(nf1) and os.path.isfile(nf2): + print("%s has been generated, return" % pre) + return None + fouts = { + "fo_r1": gzip.open(nf1, "wt"), + "fo_r2": gzip.open(nf2, "wt"), + } + #processing pairing fastqs + with gzip.open(fq1, "rt") as f1, gzip.open(fq2, "rt") as f2: + i = 0 + j = 0 + for r1, r2 in zip(FastqGeneralIterator(f1), FastqGeneralIterator(f2)): + r1, r2 = list(r1), list(r2) + i += 1 + if i % 100000 == 0: + print("%s reads processed for %s" % (i, pre)) + #check the starts + """ + if not (checkStarts(r1[1]) and checkStarts(r2[1])): + continue + if r1[1][0] == "N": + r1[1] = r1[1][1:] + r1[2] = r1[2][1:] + if r2[1][0] == "N": + r2[1] = r2[1][1:] + r2[2] = r2[2][1:] + """ + #check the linker + r1pos = findLinker(r1[1], linker) + r2pos = findLinker(r2[1], linker) + #trim reads + if r1pos != -1: + r1[1] = r1[1][:r1pos] + r1[2] = r1[2][:r1pos] + if r2pos != -1: + r2[1] = r2[1][:r2pos] + r2[2] = r2[2][:r2pos] + rid = "_".join(list(map(str, [i, r1pos, r2pos]))) + r1[0] = rid + r2[0] = rid + if len(r1[1]) >= rlen and len(r2[1]) >= rlen: + j += 1 + fouts["fo_r1"].write("@%s\n%s\n+\n%s\n" % + (r1[0], r1[1], r1[2])) + fouts["fo_r2"].write("@%s\n%s\n+\n%s\n" % + (r2[0], r2[1], r2[2])) + return sample, i, j, nf1, nf2 + + +def tracMapping(sample, fqs, ref, outdir, cpus=25): + """ + Mapping settings for Trac-looping data. + """ + logger.info("Start mapping %s.\n" % sample) + od = os.path.join(outdir, sample) + if not os.path.exists(od): + os.makedirs(od, exist_ok=True) + sam = od + "/" + sample + ".sam" + bam = od + "/" + sample + ".bam" + if os.path.isfile(sam): + logger.error("%s:%s exists, return." % (sample, sam)) + return None + if os.path.isfile(bam): + logger.error("%s:%s exists, return." % (sample, bam)) + return None + doBowtie = "bowtie2 -p {cpus} -q --end-to-end --very-sensitive -x {ref} -1 {fq1} -2 {fq2} -S {sam}".format( + cpus=cpus, ref=ref, fq1=fqs[0], fq2=fqs[1], sam=sam) + logger.info(doBowtie) + stat, output = subprocess.getstatusoutput(doBowtie) + #trim with "Warning" + output = output.split("\n") + output = [t for t in output if not t.startswith("Warning")] + output = "\n".join(output) + logger.info("FLAG_A:" + sample + "\n" + output + "\nFLAG_A\n") + lines = output.split("\n") + totalReads = int(lines[1].split(";")[0].split()[0]) + mapRatio = float(lines[-1].split("%")[0]) + return sample, sam + + +def getUniqueBedpe(f, fout): + """ + Get unique bedpe. Read id indicate the linker location. 
+ """ + if os.path.isfile(fout): + return + print("Getting unique PETs from %s to %s" % (f, fout)) + redus = set() + with open(fout, "w") as fo: + for i, line in enumerate(open(f)): + line = line.split("\n")[0].split("\t") + if len(line) < 6: + continue + rid = list(map(int, line[6].split("_"))) + #for cis short reads, requiring the linkers + if line[0] == line[3]: + dis = abs((int(line[1]) + int(line[2])) / 2 - + (int(line[4]) + int(line[5])) / 2) + if dis < 1000 and rid[1] + rid[2] == -2: + continue + #for trans reads, requiring the linkers + if line[0] != line[3]: + if rid[1] + rid[2] == -2: + continue + #remove redudant PETs + r = hash(tuple(line[:6])) + if r in redus: + continue + else: + redus.add(r) + fo.write("\t".join(line) + "\n") + + +def sam2bamBedpe(sample, sam, mapq=10): + """ + SAM to BAM and bedpe file + """ + n = os.path.splitext(sam)[0] + bam = n + ".bam" + bedpeAll = n + "_all.bedpe" + bedpeUni = n + "_unique.bedpe" + #sam to bam, filtering mapq + samview = "samtools view -b -F 4 -@ 2 -q {mapq} -o {bam} {sam}".format( + mapq=mapq, bam=bam, sam=sam) + #sort by read name + samsort = "samtools sort -n -@ 2 {bam} -T {pre} -o {bam}".format( + bam=bam, pre=bam.replace(".bam", "")) + rmsam = "rm %s" % (sam) + cmds = [samview, samsort, rmsam] + callSys(cmds, logger) + bam2bedpe = "bamToBed -bedpe -i {bam} > {bedpe}".format(bam=bam, + bedpe=bedpeAll) + logger.info(bam2bedpe) + stat, output = subprocess.getstatusoutput(bam2bedpe) + getUniqueBedpe(bedpeAll, bedpeUni) + cmd = "gzip %s %s" % (bedpeAll, bedpeUni) + callSys([cmd], logger) + return sample, bedpeAll + ".gz", bedpeUni + ".gz" + + +def sParseBowtie(lines): + """ + Parse Bowtie2 log file, to obtain mapping stastics. + """ + d, s = None, None + lines = lines.split("\n") + s = lines[0] + totalReads = int(lines[1].split(";")[0].split()[0]) + d1 = lines[4].strip().split() + conUniqueMappedReads = int(d1[0]) + d2 = lines[8].strip().split() + unconUniqueMappedReads = int(d2[0]) + #mapRatio = float(lines[15].split("%")[0]) + mapRatio = float(lines[-2].split("%")[0]) + d = { + "TotalRawReads": totalReads, + #"ConcordantlyUniqueMapReads": conUniqueMappedReads, + #"DisconcordantlyUniqueMapReads": unconUniqueMappedReads, + "MappingRatio(%s)": mapRatio + #"MultipleMapReads": multipleMappedReads, + #"MultipleMapRatio": multipleMappedRatio, + } + return d, s + + +def parseBowtielog(logs): + data = {} + for log in logs: + lines = open(log).read().split("FLAG_A\n") + lines = [line for line in lines if "FLAG_A" in line] + for line in lines: + t = line.split("FLAG_A:")[1] + d, s = sParseBowtie(t) + data[s] = d + data = pd.DataFrame(data).T + return data + + +def main(): + """ + Batch converting from bam to bedpe. + """ + #prepare everything + op = help() + date = time.strftime(' %Y-%m-%d', time.localtime(time.time())) + logger = getLogger(fn=op.output + "/" + date.strip() + "_" + os.path.basename(__file__) + ".log") + for t in ["bowtie2", "samtools", "bamToBed"]: + if not isTool(t): + logger.error("%s not exits! Please install through conda." % t) + return + if not os.path.exists(op.fqd): + logger.error("Input %s not exists! Return." % op.fqd) + return + if len(glob(op.ref + "*.bt2")) == 0: + logger.error("Bowtie2 reference not exists for prefix of %s! Return." % + op.ref) + return + if not os.path.exists(op.output): + os.makedirs(op.output, exist_ok=True) + else: + fs = glob(os.path.join(op.output, "*")) + if len(fs) > 0: + logger.info( + "Target output directory %s is not empty, may over-write some files." 
+ % op.output) + data = preFqs(op.fqd) + if len(data) == 0: + logger.error( + "No matched _R1.fastq.gz and _R2.fastq.gz in %s. Return." % + (op.fqd)) + return + #prepare output dir + dirs = {} + for sample in data.keys(): + od = os.path.join(op.output, sample) + dirs[sample] = od + if not os.path.exists(od): + os.makedirs(od, exist_ok=True) + + #step 1, filter linkers + logger.info("Step1: Trim linkers and remove short sequences.") + ds = Parallel(n_jobs=op.number)( + delayed(cutLinker)(fqs[0], fqs[1], os.path.join(dirs[sample], sample)) + for sample, fqs in data.items()) + data = {} + for d in ds: + if d is not None: + data[d[0]] = { + "totalRaw": d[1], + "filterLinkers": d[2], + "f1": d[3], + "f2": d[4], + } + + #step2, mapping + logger.info("Step2: Map processed reads to genome.") + ref = op.ref + ds = Parallel(n_jobs=op.number, backend="multiprocessing")( + delayed(tracMapping)( + sample, [vs["f1"], vs["f2"]], ref, op.output, cpus=op.cpu) + for sample, vs in data.items()) + for d in ds: + if d is not None: + data[d[0]]["sam"] = d[1] + + #step3, convert to bam and bedpe files + #sam to bam and bedpe + logger.info("Step3: File type conversion. ") + cpus = op.number * op.cpu + ncpus = int(min(len(data), cpus / 2)) + ds = Parallel(n_jobs=ncpus, backend="multiprocessing")( + delayed(sam2bamBedpe)(sample, vs["sam"], op.mapq) + for sample, vs in data.items()) + + allBedpes = [] + uniBedpes = [] + for d in ds: + if d is not None: + data[d[0]]["allBedpe"] = d[1] + data[d[0]]["uniNonbgBedpe"] = d[2] + allBedpes.append(d[1]) + uniBedpes.append(d[2]) + data = pd.DataFrame(data).T + + #step 4, all PETs cLoops2 qc + logger.info("Step4: All mapped PETs QC. ") + cmd = "cLoops2 qc -f %s -o %s/allBedpeQc -p %s" % (",".join(allBedpes),op.output, + min(len(allBedpes), cpus)) + callSys([cmd], logger) + + #step 5, unqiue PETs cLoops2 qc + logger.info("Step5: Unique non-background PETs QC. ") + cmd = "cLoops2 qc -f %s -o %s/uniNonBgBedpeQc -p %s" % ( + ",".join(uniBedpes), op.output, min(len(uniBedpes), cpus)) + callSys([cmd], logger) + + #step 6, combine report + logger.info("Step5: Generate report. ") + logs = glob(op.output+"/*%s*.log"%os.path.basename(__file__)) + mata = parseBowtielog(logs) + + matb = pd.read_csv("%s/allBedpeQc_bedpeQc.txt"%op.output, index_col=0, sep="\t") + matb.index = [i.split("_all")[0] for i in matb.index] + matc = pd.read_csv("%s/uniNonBgBedpeQc_bedpeQc.txt"%op.output, index_col=0, sep="\t") + matc.index = [i.split("_unique")[0] for i in matc.index] + + for c in matb.columns: + mata[c] = matb[c] + mata.to_csv("%s/tracPre_summary.txt"%op.output, sep="\t") + + mat = {} + mat["total raw sequences"] = data["totalRaw"] + mat["after linker removing sequences"] = data["filterLinkers"] + mat["mapping ratio"] = mata["MappingRatio(%s)"] / 100 + + mat["total mapped PETs (mapq>=%s)" % op.mapq] = matb["TotalPETs"] + mat["total mapped PETs redundancy"] = matb["Redundancy"] + mat["total mapped PETs intra-chromosomal ratio"] = matb["cisRatio"] + mat["total mapped PETs close ratio (distance<=1kb)"] = matb["closeRatio"] + mat["total mapped PETs middle ratio (1kb=%s)" % op.mapq, + "total mapped PETs redundancy", + "total mapped PETs intra-chromosomal ratio", + "total mapped PETs close ratio (distance<=1kb)", + "total mapped PETs middle ratio (1kb - -

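Both tracPre.py and tracPre2.py above de-duplicate PETs by keying on the first six BEDPE columns inside getUniqueBedpe(). For readers who want to reuse that step outside the pipeline, below is a minimal standalone sketch of the same idea; it is not part of the patch, and the function name dedup_bedpe and the example file paths are illustrative assumptions.

```
# A minimal sketch of the first-six-column PET de-duplication used by
# getUniqueBedpe() in the scripts above. Names and paths are illustrative.
import gzip


def dedup_bedpe(fin, fout):
    """Keep only the first occurrence of each
    (chrom1, start1, end1, chrom2, start2, end2) PET."""
    seen = set()
    # transparently handle plain or gzipped BEDPE input
    opener = gzip.open if fin.endswith(".gz") else open
    with opener(fin, "rt") as f, open(fout, "w") as fo:
        for line in f:
            fields = line.rstrip("\n").split("\t")
            if len(fields) < 6:
                continue
            key = tuple(fields[:6])
            if key in seen:
                continue
            seen.add(key)
            fo.write("\t".join(fields) + "\n")


if __name__ == "__main__":
    # example call; replace the paths with real files
    dedup_bedpe("sample_all.bedpe.gz", "sample_unique.bedpe")
```

One design note: the sketch stores the six-field tuples themselves rather than hash() values as the scripts do, so an accidental hash collision cannot silently drop a distinct PET; the trade-off is somewhat higher memory use for very large libraries.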
- - - ------- - ------- - ## Introduction - cLoops2 is an extension of our previous work, [cLoops](https://github.com/YaqiangCao/cLoops). From loop-calling based on assumption-free clustering to a full suite of analysis tools for 3D genomic interaction data, cLoops2 has been adapted specifically for data such as Hi-TrAC/Trac-looping, for which interactions are enriched over the genome through experimental steps. cLoops2 still supports Hi-C -like data, of which the interaction signals are evenly distributed at enzyme cutting sites. The changes from cLoops to cLoops2 are designed to address challenges around aiming for higher resolutions with the next-generation of genome architecture mapping technologies. - - cLoops2 is designed with respect reference to [bedtools](https://bedtools.readthedocs.io/en/latest/) and [Samtools](http://www.htslib.org/) for command-line style programming. If you have experience with them, you will find cLoops2 easy and efficient to use and combine commands, integrate as steps in your processing pipeline. - - Please refer to our [Hi-TrAC method manuscript]() or [cLoops2 manuscript](https://www.biorxiv.org/content/10.1101/2021.07.20.453068v1) for what cLoops2 can do and show. - - If you use cLoops2 in your research (the idea, the algorithm, the analysis scripts or the supplemental data), please give us a star on the GitHub repo page and cite our paper as follows: - - Preprint bioRxiv: [Yaqiang Cao et al. "cLoops2: a full-stack comprehensive analytical tool for chromatin interactions"](https://www.biorxiv.org/content/10.1101/2021.07.20.453068v1) - - - ------- - ------- - ## Install - #### 1. Easy way through pip for stable version - Python3 is requried. - ``` - pip install cLoops2 - ``` - - ------- - #### 2. Install from source with test data for latest version - cLoops2 is written purely in Python3 (cLoops was written in Python2). If you are familiar with [conda](https://docs.conda.io/en/latest/), cLoops2 can be installed easily with the following Linux shell commands (also tested well in win10 ubuntu subsystem, MacOS). - ``` - # for most updated code, or download the release version - git clone --depth=1 https://github.com/YaqiangCao/cLoops2 - cd cLoops2 - conda env create --name cLoops2 --file cLoops2_env.yaml - conda activate cLoops2 - python3 setup.py install - ``` - - Necessary Python3 third-party packages are listed below, all of which can be installed through conda. If you like to install cLoops2 through the old school way ***python setup.py install***, please install the 3rd dependencies first. - ``` - tqdm - numpy - scipy - pandas - sklearn - seaborn - pyBigWig - matplotlib - joblib - networkx - ``` - - After installation, whenever you want to run cLoops2, just activate the environment with conda: **conda activate cLoops2**. - Happy peak/loop-calling and have fun exploring all the other kinds of analyses. - - - ------ - ------ - ## cLoops2 Main Functions - Run ***cLoops2*** or ***cLoops2 -h*** can show the main functions of cLoops2 with short descriptions and examples. - ``` - An enhanced, accurate and flexible peak/domain/loop-calling and analysis tool - for 3D genomic interaction data. - - Use cLoops2 sub-command -h to see detail options and examples for sub-commands. - Available sub-commands are: - qc: quality control of BEDPE files before analysis. - pre: preprocess input BEDPE files into cLoops2 data. - update: update cLoops2 data files locations. - combine: combine multiple cLooops2 data directories. 
- dump: convert cLoops2 data files to others (BEDPE, HIC, washU, bedGraph and - contact matrix) - estEps: estimate eps using Gaussian mixture models or k-distance plot. - estRes: estimate reasonable contact matrix resolution based on signal - enrichment. - estDis: estimate significant interactions distance range. - estSat: estimate sequencing saturation based on contact matrix. - estSim: estimate similarities among samples based on contact matrix. - filterPETs: filter PETs based on peaks, loops, singleton mode or knn mode. - samplePETs: sample PETs according to specific target size. - callPeaks: call peaks for ChIP-seq, ATAC-seq, ChIC-seq and CUT&Tag or the - 3D genomic data such as Trac-looping, Hi-TrAC, HiChIP and more. - callLoops: call loops for 3D genomic data. - callDiffLoops: call differentially enriched loops for two datasets. - callDomains: call domains for 3D genomic data. - plot: plot the interaction matrix, genes, view point plot, 1D tracks, - peaks, loops and domains for a specific region. - montage: analysis of specific regions, producing Westworld Season 3 -like - Rehoboam plot. - agg: aggregated feature analysis and plots, features can be peaks, view - points, loops and domains. - quant: quantify peaks, loops and domains. - anaLoops: anotate loops for target genes. - findTargets: find target genes of genomic regions through networks from - anaLoops. - - Examples: - cLoops2 qc -f trac_rep1.bedpe.gz,trac_rep2.bedpe,trac_rep3.bedpe.gz \ - -o trac_stat -p 3 - cLoops2 pre -f ../test_GM12878_chr21_trac.bedpe -o trac - cLoops2 update -d ./trac - cLoops2 combine -ds ./trac1,./trac2,./trac3 -o trac_combined -keep 1 - cLoops2 dump -d ./trac -o trac -hic - cLoops2 estEps -d trac -o trac_estEps_gmm -p 10 -method gmm - cLoops2 estRes -d trac -o trac_estRes -p 10 -bs 25000,5000,1000,200 - cLoops2 estDis -d trac -o trac -plot -bs 1000 - cLoops2 estSim -ds Trac1,Trac2 -o trac_sim -p 10 -bs 2000 -m pcc -plot - cLoops2 filterPETs -d trac -peaks trac_peaks.bed -o trac_peaksFiltered -p 10 - cLoops2 samplePETs -d trac -o trac_sampled -t 5000000 -p 10 - cLoops2 callPeaks -d H3K4me3_ChIC -bgd IgG_ChIC -o H3K4me3_cLoops2 -eps 150 \ - -minPts 10 - cLoops2 callLoops -d Trac -eps 200,500,1000 -minPts 3 -filter -o Trac -w -j \ - -cut 2000 - cLoops2 callLoops -d HiC -eps 1000,5000,10000 -minPts 10,20,50,100 -w -j \ - -trans -o HiC_trans - cLoops2 callDiffLoops -tloop target_loop.txt -cloop control_loop.txt \ - -td ./target -cd ./control -o target_diff - cLoops2 callDomains -d trac -o trac -bs 10000 -ws 200000 - cLoops2 plot -f test/chr21-chr21.ixy -o test -bs 500 -start 34840000 \ - -end 34895000 -triu -1D -loop test_loops.txt -log \ - -gtf hg38.gtf -bws ctcf.bw -beds enhancer.bed - cLoops2 montage -f test/chr21-chr21.ixy -o test -bed test.bed - cLoops2 agg -d trac -loops trac.loop -peaks trac_peaks.bed \ - -domains hic_domains.bed -bws CTCF.bw,ATAC.bw -p 20 -o trac - cLoops2 quant -d trac -peaks trac_peaks.bed -loops trac.loop \ - -domains trac_domain.txt -p 20 -o trac - cLoops2 anaLoops -loops test_loop.txt -gtf gene.gtf -net -o test - cLoops2 findTargets -net test_ep_net.sif -tg test_targets.txt \ - -bed GWAS.bed -o test - More usages and examples are shown when run with cLoops2 sub-command -h. - - - optional arguments: - -h, --help show this help message and exit - -d PREDIR Assign data directory generated by cLoops2 pre to carry out analysis. - -o FNOUT Output data directory / file name prefix, default is cLoops2_output. 
- -p CPU CPUs used to run the job, default is 1, set -1 to use all CPUs - available. Too many CPU could cause out-of-memory problem if there are - too many PETs. - -cut CUT Distance cutoff to filter cis PETs, only keep PETs with distance - >=cut. Default is 0, no filtering. - -mcut MCUT Keep the PETs with distance <=mcut. Default is -1, no filtering. - -v Show cLoops2 verison number and exit. - --- Following are sub-commands specific options. This option just show - version of cLoops2. - - Bug reports are welcome and can be put as issue at github repo or sent to - caoyaqiang0410@gmail.com or yaqiang.cao@nih.gov. Thank you. - ``` - - ------ - ### 1. Quality control for BEDPE files - Run **cLoops2 qc -h** to see details. - ``` - Get the basic quality control statistical information from interaction BEDPE - files. - - Example: - cLoops2 qc -f trac_rep1.bedpe.gz,trac_rep2.bedpe,trac_rep3.bedpe.gz -p 3 \ - -o trac_stat - - - optional arguments: - -h, --help show this help message and exit - -d PREDIR Assign data directory generated by cLoops2 pre to carry out analysis. - -o FNOUT Output data directory / file name prefix, default is cLoops2_output. - -p CPU CPUs used to run the job, default is 1, set -1 to use all CPUs - available. Too many CPU could cause out-of-memory problem if there are - too many PETs. - -cut CUT Distance cutoff to filter cis PETs, only keep PETs with distance - >=cut. Default is 0, no filtering. - -mcut MCUT Keep the PETs with distance <=mcut. Default is -1, no filtering. - -v Show cLoops2 verison number and exit. - --- Following are sub-commands specific options. This option just show - version of cLoops2. - -f FNIN Input BEDPE file(s), .bedpe and .bedpe.gz are both suitable. Multiple - samples can be assigned as -f A.bedpe.gz,B.bedpe.gz,C.bedpe.gz. - ``` - - ------ - ### 2. Pre-process BEDPE into cLoops2 data - Run **cLoops2 pre -h** to see details. - ``` - Preprocess mapped PETs into cLoops2 data files. - - Support input file formats: - BEDPE: https://bedtools.readthedocs.io/en/latest/content/general-usage.html - PAIRS: https://pairtools.readthedocs.io/en/latest/formats.html#pairs - - The output directory contains one .json file for the basic statistics of PETs - information and .ixy files which are coordinates for every PET. The coordinate - files will be used to call peaks, loops or any other analyses implemented in - cLoops2. For data backup/sharing purposes, the directory can be saved as - .tar.gz file through tar. If changed and moved location, run - ***cLoops2 update -d*** to update. - - Examples: - 1. keep high quality PETs of chromosome chr21 - cLoops2 pre -f trac_rep1.bepee.gz,trac_rep2.bedpe.gz -o trac -c chr21 - - 2. keep all cis PETs that have distance > 1kb - cLoops2 pre -f trac_rep1.bedpe.gz,trac_rep2.bedpe.gz -o trac -mapq 0 - - - - optional arguments: - -h, --help show this help message and exit - -d PREDIR Assign data directory generated by cLoops2 pre to carry out analysis. - -o FNOUT Output data directory / file name prefix, default is cLoops2_output. - -p CPU CPUs used to run the job, default is 1, set -1 to use all CPUs - available. Too many CPU could cause out-of-memory problem if there are - too many PETs. - -cut CUT Distance cutoff to filter cis PETs, only keep PETs with distance - >=cut. Default is 0, no filtering. - -mcut MCUT Keep the PETs with distance <=mcut. Default is -1, no filtering. - -v Show cLoops2 verison number and exit. - --- Following are sub-commands specific options. This option just show - version of cLoops2. 
- -f FNIN Input BEDPE or PAIR file(s), .bedpe and .bedpe.gz are both suitable. - Replicates or multiple samples can be assigned as -f A.bedpe.gz, - B.bedpe.gz,C.bedpe.gz to get merged PETs. - -c CHROMS Argument to process limited set of chromosomes, specify it as chr1, - chr2,chr3. Use this option to filter reads from such as - chr22_KI270876v1. The default setting is to use the entire set of - chromosomes from the data. - -trans Whether to parse trans- (inter-chromosomal) PETs. The default is to - ignore trans-PETs. Set this flag to pre-process all PETs. - -mapq MAPQ MAPQ cutoff to filter raw PETs, default is >=10. This option is not - valid when input is .pairs file. - -format {bedpe,pairs} - cLoops2 currently supports BEDPE and PAIRs file format. Default is bedpe. - ``` - - ------ - ### 3. Update cLoops2 data directory - Run **cLoops2 update -h** to see details. - ``` - Update cLoops2 data files generated by **cLoops2 pre**. - - In the **cLoops2 pre** output directory, there is a .json file annotated with - the .ixy **absolute paths** and other information. So if the directory is - moved, or some .ixy files are removed or changed, this command is needed to - update the paths, otherwise the other analysis modules will not work. - - Example: - cLoops2 update -d ./Trac - - - optional arguments: - -h, --help show this help message and exit - -d PREDIR Assign data directory generated by cLoops2 pre to carry out analysis. - -o FNOUT Output data directory / file name prefix, default is cLoops2_output. - -p CPU CPUs used to run the job, default is 1, set -1 to use all CPUs - available. Too many CPU could cause out-of-memory problem if there are - too many PETs. - -cut CUT Distance cutoff to filter cis PETs, only keep PETs with distance - >=cut. Default is 0, no filtering. - -mcut MCUT Keep the PETs with distance <=mcut. Default is -1, no filtering. - -v Show cLoops2 verison number and exit. - --- Following are sub-commands specific options. This option just show - version of cLoops2. - ``` - - ------ - ### 4. Convert cLoops2 data to others - Run **cLoops2 dump -h** to see details. - ``` - Convert cLoops2 data files to other types. Currently supports BED file,BEDPE - file, HIC file, washU long-range track, bedGraph file and matrix txt file. - - Converting cLoops2 data to .hic file needs "juicer_tools pre" in the command - line enviroment. - Converting cLoops2 data to legacy washU browser long-range track needs bgzip - and tabix. Format reference: http://wiki.wubrowse.org/Long-range. - Converting cLoops2 data to UCSC bigInteract track needs bedToBigBed. Format - reference: https://genome.ucsc.edu/goldenPath/help/interact.html. - Converting cLoops2 data to bedGraph track will normalize value as RPM - (reads per million). Run with -bdg_pe flag for 1D data such as ChIC-seq, - ChIP-seq and ATAC-seq. - Converting cLoops2 data to matrix txt file will need specific resolution. - The output txt file can be loaded in TreeView for visualization or further - analysis. - - Examples: - 1. convert cLoops2 data to single-end .bed file fo usage of BEDtools or - MACS2 for peak-calling with close PETs - cLoops2 dump -d trac -o trac -bed -mcut 1000 - - 2. convert cLoops2 data to .bedpe file for usage of BEDtools, only keep - PETs distance >1kb and < 1Mb - cLoops2 dump -d trac -o trac -bedpe -bedpe_ext -cut 1000 -mcut 1000000 - - 3. convert cLoops2 data to .hic file to load in juicebox - cLoops2 dump -d trac -o trac -hic -hic_org hg38 \ - -hic_res 200000,20000,5000 - - 4. 
convert cLoops2 data to washU long-range track file, only keep PETs - distance > 1kb - cLoops2 dump -d trac -o trac -washU -washU_ext 50 -cut 1000 - - 5. convert cLoops2 data to UCSC bigInteract track file - cLoops2 dump -d trac -o trac -ucsc -ucsc_cs ./hg38.chrom.sizes - - 6. convert interacting cLoops2 data to bedGraph file with all PETs - cLoops2 dump -d trac -o trac -bdg -bdg_ext 100 - - 7. convert 1D cLoops2 data (such as ChIC-seq/ChIP-seq/ATAC-seq) to bedGraph - file - cLoops2 dump -d trac -o trac -bdg -pe - - 8. convert 3D cLoops2 data (such as Trac-looping) to bedGraph file for peaks - cLoops2 dump -d trac -o trac -bdg -mcut 1000 - - 9. convert one region in chr21 to contact matrix correlation matrix txt file - cLoops2 dump -d test -mat -o test -mat_res 10000 \ - -mat_chrom chr21-chr21 -mat_start 36000000 \ - -mat_end 40000000 -log -corr - - - optional arguments: - -h, --help show this help message and exit - -d PREDIR Assign data directory generated by cLoops2 pre to carry out analysis. - -o FNOUT Output data directory / file name prefix, default is cLoops2_output. - -p CPU CPUs used to run the job, default is 1, set -1 to use all CPUs - available. Too many CPU could cause out-of-memory problem if there are - too many PETs. - -cut CUT Distance cutoff to filter cis PETs, only keep PETs with distance - >=cut. Default is 0, no filtering. - -mcut MCUT Keep the PETs with distance <=mcut. Default is -1, no filtering. - -v Show cLoops2 verison number and exit. - --- Following are sub-commands specific options. This option just show - version of cLoops2. - -bed Convert data to single-end BED file. - -bed_ext BED_EXT Extension from the center of the read to both ends for BED file. - Default is 50. - -bedpe Convert data to BEDPE file. - -bedpe_ext BEDPE_EXT Extension from the center of the PET to both ends for BEDPE file. - Default is 50. - -hic Convert data to .hic file. - -hic_org HIC_ORG Organism required to generate .hic file,default is hg38. If the - organism is not available, assign a chrom.size file. - -hic_res HIC_RES Resolutions used to generate .hic file. Default is 1000,5000,25000, - 50000,100000,200000. - -washU Convert data to legacy washU browser long-range track. - -washU_ext WASHU_EXT Extension from the center of the PET to both ends for washU track. - Default is 50. - -ucsc Convert data to UCSC bigInteract file track. - -ucsc_ext UCSC_EXT Extension from the center of the PET to both ends for ucsc - track. Default is 50. - -ucsc_cs UCSC_CS A chrom.sizes file. Can be obtained through fetchChromSizese. - Required for -ucsc option. - -bdg Convert data to 1D bedGraph track file. - -bdg_ext BDG_EXT Extension from the center of the PET to both ends for - bedGraph track. Default is 50. - -bdg_pe When converting to bedGraph, argument determines whether to treat PETs - as ChIP-seq, ChIC-seq or ATAC-seq paired-end libraries. Default is not. - PETs are treated as single-end library for interacting data. - -mat Convert data to matrix txt file with required resolution. - -mat_res MAT_RES Bin size/matrix resolution (bp) to generate the contact matrix. - Default is 5000 bp. - -mat_chrom CHROM The chrom-chrom set will be processed. Specify it as chr1-chr1. - -mat_start START Start genomic coordinate for the target region. Default will be the - smallest coordinate from specified chrom-chrom set. - -mat_end END End genomic coordinate for the target region. Default will be the - largest coordinate from specified chrom-chrom set. - -log Whether to log transform the matrix. Default is not. 
- -m {obs,obs/exp} The type of matrix, observed matrix or observed/expected matrix, - expected matrix will be generated by shuffling PETs. Default is - observed. - -corr Whether to get the correlation matrix. Default is not. - -norm Whether to normalize the matrix with z-score. Default is not. - - ``` - - - ------ - ### 5. Estimate eps - Run **cLoops2 estEps -h** to see details. - ``` - Estimate key parameter eps. - - Two methods are implemented: 1) unsupervised Gaussian mixture model (gmm), and - 2) k-distance plot (k-dis,-k needed). Gmm is based on the assumption that PETs - can be classified into self-ligation (peaks) and inter-ligation (loops). K-dis - is based on the k-nearest neighbors distance distribution to find the "knee", - which is where the distance (eps) between neighbors has a sharp increase along - the k-distance curve. K-dis is the traditional approach literatures, but it is - much more time consuming than gmm, and maybe only fit to small cases. If both - methods do not give nice plots, please turn to the empirical parameters you - like, such as 100,200 for ChIP-seq -like data, 5000,1000 for Hi-C and etc. - - Examples: - 1. estimate eps with Gaussian mixture model - cLoops2 estEps -d trac -o trac_estEps_gmm -p 10 -method gmm - - 2. estimate eps with k-nearest neighbors distance distribution - cLoops2 estEps -d trac -o trac_estEps_kdis -p 10 -method k-dis -k 5 - - - optional arguments: - -h, --help show this help message and exit - -d PREDIR Assign data directory generated by cLoops2 pre to carry out analysis. - -o FNOUT Output data directory / file name prefix, default is cLoops2_output. - -p CPU CPUs used to run the job, default is 1, set -1 to use all CPUs - available. Too many CPU could cause out-of-memory problem if there are - too many PETs. - -cut CUT Distance cutoff to filter cis PETs, only keep PETs with distance - >=cut. Default is 0, no filtering. - -mcut MCUT Keep the PETs with distance <=mcut. Default is -1, no filtering. - -v Show cLoops2 verison number and exit. - --- Following are sub-commands specific options. This option just show - version of cLoops2. - -fixy FIXY Assign the .ixy file to estimate eps inside of the whole directory - generated by cLoops2 pre. For very large data, especially Hi-C, this - option is recommended for chr1 (or the smaller one) to save time. - -k KNN The k-nearest neighbors used to draw the k-distance plot. Default is 0 - (not running), set this when -method k-dis. Suggested 5 for - ChIA-PET/Trac-looping data, 20 or 30 for Hi-C like data. - -method {gmm,k-dis} Two methods can be chosen to estimate eps. Default is Gmm. See above - for difference of the methods. - - ``` - - ------ - ### 6. Estimate reasonable contact matrix resolution - Run **cLoops2 estRes -h** to see details. - ``` - Estimate reasonable genome-wide contact matrix resolution based on signal - enrichment. - - PETs will be assigned to contact matrix bins according to input resolution. A - bin is marked as [nx,ny], and a PET is assigned to a bin by nx = int((x-s)/bs), - ny = int((y-s)/bs), where s is the minimal coordinate for all PETs and bs is - the bin size. Self-interaction bins (nx=ny) will be ignored. The bins only - containing singleton PETs are assumed as noise. - - The output is a PDF plot, for each resolution, a line is separated into two - parts: 1) dash line indicated linear increased trend of singleton PETs/bins; 2) - solid thicker line indicated non-linear increased trend of higher potential - signal PETs/bins. 
The higher the ratio of signal PETs/bins, the easier it it to - find loops in that resolution. The closer to the random line, the higher the - possibility to observe evenly distributed signals. - - We expect the highest resolution with >=50% PETs are not singletons. - - Example: - cLoops2 estRes -d trac -o trac -bs 10000,5000,1000 -p 20 - - optional arguments: - -h, --help show this help message and exit - -d PREDIR Assign data directory generated by cLoops2 pre to carry out analysis. - -o FNOUT Output data directory / file name prefix, default is cLoops2_output. - -p CPU CPUs used to run the job, default is 1, set -1 to use all CPUs - available. Too many CPU could cause out-of-memory problem if there are - too many PETs. - -cut CUT Distance cutoff to filter cis PETs, only keep PETs with distance - >=cut. Default is 0, no filtering. - -mcut MCUT Keep the PETs with distance <=mcut. Default is -1, no filtering. - -v Show cLoops2 verison number and exit. - --- Following are sub-commands specific options. This option just show - version of cLoops2. - -bs BINSIZE Candidate contact matrix resolution (bin size) to estimate signal - enrichment. A series of comma-separated values or a single value can - be used as input. For example,-bs 1000,5000,10000. Default is 5000. - - ``` - - ------ - ### 7. Estimate significant interaction distance range - Run **cLoops2 estDis -h** to see details. - ``` - Estimate the significant interaction distance limitation by getting the observed - and expected random background of the genomic distance vs interaction frequency. - - Example: - cLoops2 estDis -d trac -o trac -bs 5000 -p 20 -plot - - - optional arguments: - -h, --help show this help message and exit - -d PREDIR Assign data directory generated by cLoops2 pre to carry out analysis. - -o FNOUT Output data directory / file name prefix, default is cLoops2_output. - -p CPU CPUs used to run the job, default is 1, set -1 to use all CPUs +Keywords: peak-calling loop-calling Hi-Trac interaction visualization +Classifier: Environment :: Console +Classifier: Operating System :: MacOS :: MacOS X +Classifier: Operating System :: POSIX +Classifier: Topic :: Scientific/Engineering :: Bio-Informatics +Requires-Python: >=3 +Description-Content-Type: text/markdown +License-File: LICENSE + +## cLoops2: full stack analysis tool for chromatin interactions +

+ +

+ + +------- +------- +## Introduction +cLoops2 is an extension of our previous work, [cLoops](https://github.com/YaqiangCao/cLoops). From loop-calling based on assumption-free clustering to a full suite of analysis tools for 3D genomic interaction data, cLoops2 has been adapted specifically for data such as Hi-TrAC/Trac-looping, for which interactions are enriched over the genome through experimental steps. cLoops2 still supports Hi-C -like data, of which the interaction signals are evenly distributed at enzyme cutting sites. The changes from cLoops to cLoops2 are designed to address challenges around aiming for higher resolutions with the next-generation of genome architecture mapping technologies. + +cLoops2 is designed with respect reference to [bedtools](https://bedtools.readthedocs.io/en/latest/) and [Samtools](http://www.htslib.org/) for command-line style programming. If you have experience with them, you will find cLoops2 easy and efficient to use and combine commands, integrate as steps in your processing pipeline. + +Please refer to our [Hi-TrAC method manuscript]() or [cLoops2 manuscript](https://www.biorxiv.org/content/10.1101/2021.07.20.453068v1) for what cLoops2 can do and show. + +If you use cLoops2 in your research (the idea, the algorithm, the analysis scripts or the supplemental data), please give us a star on the GitHub repo page and cite our paper as follows: + +Preprint bioRxiv: [Yaqiang Cao et al. "cLoops2: a full-stack comprehensive analytical tool for chromatin interactions"](https://www.biorxiv.org/content/10.1101/2021.07.20.453068v1) + + +------- +------- +## Install +#### 1. Easy way through pip for stable version +Python3 is requried. +``` +pip install cLoops2 +``` + +------- +#### 2. Install from source with test data for latest version +cLoops2 is written purely in Python3 (cLoops was written in Python2). If you are familiar with [conda](https://docs.conda.io/en/latest/), cLoops2 can be installed easily with the following Linux shell commands (also tested well in win10 ubuntu subsystem, MacOS). +``` +# for most updated code, or download the release version +git clone --depth=1 https://github.com/YaqiangCao/cLoops2 +cd cLoops2 +conda env create --name cLoops2 --file cLoops2_env.yaml +conda activate cLoops2 +python3 setup.py install +``` + +Necessary Python3 third-party packages are listed below, all of which can be installed through conda. If you like to install cLoops2 through the old school way ***python setup.py install***, please install the 3rd dependencies first. +``` +tqdm +numpy +scipy +pandas +sklearn +seaborn +pyBigWig +matplotlib +joblib +networkx +``` + +After installation, whenever you want to run cLoops2, just activate the environment with conda: **conda activate cLoops2**. +Happy peak/loop-calling and have fun exploring all the other kinds of analyses. + + +------ +------ +## cLoops2 Main Functions +Run ***cLoops2*** or ***cLoops2 -h*** can show the main functions of cLoops2 with short descriptions and examples. +``` +An enhanced, accurate and flexible peak/domain/loop-calling and analysis tool +for 3D genomic interaction data. + +Use cLoops2 sub-command -h to see detail options and examples for sub-commands. +Available sub-commands are: + qc: quality control of BEDPE files before analysis. + pre: preprocess input BEDPE files into cLoops2 data. + update: update cLoops2 data files locations. + combine: combine multiple cLooops2 data directories. 
+ dump: convert cLoops2 data files to others (BEDPE, HIC, washU, bedGraph and + contact matrix) + estEps: estimate eps using Gaussian mixture models or k-distance plot. + estRes: estimate reasonable contact matrix resolution based on signal + enrichment. + estDis: estimate significant interactions distance range. + estSat: estimate sequencing saturation based on contact matrix. + estSim: estimate similarities among samples based on contact matrix. + filterPETs: filter PETs based on peaks, loops, singleton mode or knn mode. + samplePETs: sample PETs according to specific target size. + callPeaks: call peaks for ChIP-seq, ATAC-seq, ChIC-seq and CUT&Tag or the + 3D genomic data such as Trac-looping, Hi-TrAC, HiChIP and more. + callLoops: call loops for 3D genomic data. + callDiffLoops: call differentially enriched loops for two datasets. + callDomains: call domains for 3D genomic data. + plot: plot the interaction matrix, genes, view point plot, 1D tracks, + peaks, loops and domains for a specific region. + montage: analysis of specific regions, producing Westworld Season 3 -like + Rehoboam plot. + agg: aggregated feature analysis and plots, features can be peaks, view + points, loops and domains. + quant: quantify peaks, loops and domains. + anaLoops: anotate loops for target genes. + findTargets: find target genes of genomic regions through networks from + anaLoops. + +Examples: + cLoops2 qc -f trac_rep1.bedpe.gz,trac_rep2.bedpe,trac_rep3.bedpe.gz \ + -o trac_stat -p 3 + cLoops2 pre -f ../test_GM12878_chr21_trac.bedpe -o trac + cLoops2 update -d ./trac + cLoops2 combine -ds ./trac1,./trac2,./trac3 -o trac_combined -keep 1 + cLoops2 dump -d ./trac -o trac -hic + cLoops2 estEps -d trac -o trac_estEps_gmm -p 10 -method gmm + cLoops2 estRes -d trac -o trac_estRes -p 10 -bs 25000,5000,1000,200 + cLoops2 estDis -d trac -o trac -plot -bs 1000 + cLoops2 estSim -ds Trac1,Trac2 -o trac_sim -p 10 -bs 2000 -m pcc -plot + cLoops2 filterPETs -d trac -peaks trac_peaks.bed -o trac_peaksFiltered -p 10 + cLoops2 samplePETs -d trac -o trac_sampled -t 5000000 -p 10 + cLoops2 callPeaks -d H3K4me3_ChIC -bgd IgG_ChIC -o H3K4me3_cLoops2 -eps 150 \ + -minPts 10 + cLoops2 callLoops -d Trac -eps 200,500,1000 -minPts 3 -filter -o Trac -w -j \ + -cut 2000 + cLoops2 callLoops -d HiC -eps 1000,5000,10000 -minPts 10,20,50,100 -w -j \ + -trans -o HiC_trans + cLoops2 callDiffLoops -tloop target_loop.txt -cloop control_loop.txt \ + -td ./target -cd ./control -o target_diff + cLoops2 callDomains -d trac -o trac -bs 10000 -ws 200000 + cLoops2 plot -f test/chr21-chr21.ixy -o test -bs 500 -start 34840000 \ + -end 34895000 -triu -1D -loop test_loops.txt -log \ + -gtf hg38.gtf -bws ctcf.bw -beds enhancer.bed + cLoops2 montage -f test/chr21-chr21.ixy -o test -bed test.bed + cLoops2 agg -d trac -loops trac.loop -peaks trac_peaks.bed \ + -domains hic_domains.bed -bws CTCF.bw,ATAC.bw -p 20 -o trac + cLoops2 quant -d trac -peaks trac_peaks.bed -loops trac.loop \ + -domains trac_domain.txt -p 20 -o trac + cLoops2 anaLoops -loops test_loop.txt -gtf gene.gtf -net -o test + cLoops2 findTargets -net test_ep_net.sif -tg test_targets.txt \ + -bed GWAS.bed -o test + More usages and examples are shown when run with cLoops2 sub-command -h. + + +optional arguments: + -h, --help show this help message and exit + -d PREDIR Assign data directory generated by cLoops2 pre to carry out analysis. + -o FNOUT Output data directory / file name prefix, default is cLoops2_output. 
+ -p CPU CPUs used to run the job, default is 1, set -1 to use all CPUs + available. Too many CPU could cause out-of-memory problem if there are + too many PETs. + -cut CUT Distance cutoff to filter cis PETs, only keep PETs with distance + >=cut. Default is 0, no filtering. + -mcut MCUT Keep the PETs with distance <=mcut. Default is -1, no filtering. + -v Show cLoops2 verison number and exit. + --- Following are sub-commands specific options. This option just show + version of cLoops2. + +Bug reports are welcome and can be put as issue at github repo or sent to +caoyaqiang0410@gmail.com or yaqiang.cao@nih.gov. Thank you. +``` + +------ +### 1. Quality control for BEDPE files +Run **cLoops2 qc -h** to see details. +``` +Get the basic quality control statistical information from interaction BEDPE +files. + +Example: + cLoops2 qc -f trac_rep1.bedpe.gz,trac_rep2.bedpe,trac_rep3.bedpe.gz -p 3 \ + -o trac_stat + + +optional arguments: + -h, --help show this help message and exit + -d PREDIR Assign data directory generated by cLoops2 pre to carry out analysis. + -o FNOUT Output data directory / file name prefix, default is cLoops2_output. + -p CPU CPUs used to run the job, default is 1, set -1 to use all CPUs + available. Too many CPU could cause out-of-memory problem if there are + too many PETs. + -cut CUT Distance cutoff to filter cis PETs, only keep PETs with distance + >=cut. Default is 0, no filtering. + -mcut MCUT Keep the PETs with distance <=mcut. Default is -1, no filtering. + -v Show cLoops2 verison number and exit. + --- Following are sub-commands specific options. This option just show + version of cLoops2. + -f FNIN Input BEDPE file(s), .bedpe and .bedpe.gz are both suitable. Multiple + samples can be assigned as -f A.bedpe.gz,B.bedpe.gz,C.bedpe.gz. +``` + +------ +### 2. Pre-process BEDPE into cLoops2 data +Run **cLoops2 pre -h** to see details. +``` +Preprocess mapped PETs into cLoops2 data files. + +Support input file formats: +BEDPE: https://bedtools.readthedocs.io/en/latest/content/general-usage.html +PAIRS: https://pairtools.readthedocs.io/en/latest/formats.html#pairs + +The output directory contains one .json file for the basic statistics of PETs +information and .ixy files which are coordinates for every PET. The coordinate +files will be used to call peaks, loops or any other analyses implemented in +cLoops2. For data backup/sharing purposes, the directory can be saved as +.tar.gz file through tar. If changed and moved location, run +***cLoops2 update -d*** to update. + +Examples: + 1. keep high quality PETs of chromosome chr21 + cLoops2 pre -f trac_rep1.bepee.gz,trac_rep2.bedpe.gz -o trac -c chr21 + + 2. keep all cis PETs that have distance > 1kb + cLoops2 pre -f trac_rep1.bedpe.gz,trac_rep2.bedpe.gz -o trac -mapq 0 + + + +optional arguments: + -h, --help show this help message and exit + -d PREDIR Assign data directory generated by cLoops2 pre to carry out analysis. + -o FNOUT Output data directory / file name prefix, default is cLoops2_output. + -p CPU CPUs used to run the job, default is 1, set -1 to use all CPUs + available. Too many CPU could cause out-of-memory problem if there are + too many PETs. + -cut CUT Distance cutoff to filter cis PETs, only keep PETs with distance + >=cut. Default is 0, no filtering. + -mcut MCUT Keep the PETs with distance <=mcut. Default is -1, no filtering. + -v Show cLoops2 verison number and exit. + --- Following are sub-commands specific options. This option just show + version of cLoops2. 
+ -f FNIN Input BEDPE or PAIR file(s), .bedpe and .bedpe.gz are both suitable. + Replicates or multiple samples can be assigned as -f A.bedpe.gz, + B.bedpe.gz,C.bedpe.gz to get merged PETs. + -c CHROMS Argument to process limited set of chromosomes, specify it as chr1, + chr2,chr3. Use this option to filter reads from such as + chr22_KI270876v1. The default setting is to use the entire set of + chromosomes from the data. + -trans Whether to parse trans- (inter-chromosomal) PETs. The default is to + ignore trans-PETs. Set this flag to pre-process all PETs. + -mapq MAPQ MAPQ cutoff to filter raw PETs, default is >=10. This option is not + valid when input is .pairs file. + -format {bedpe,pairs} + cLoops2 currently supports BEDPE and PAIRs file format. Default is bedpe. +``` + +------ +### 3. Update cLoops2 data directory +Run **cLoops2 update -h** to see details. +``` +Update cLoops2 data files generated by **cLoops2 pre**. + +In the **cLoops2 pre** output directory, there is a .json file annotated with +the .ixy **absolute paths** and other information. So if the directory is +moved, or some .ixy files are removed or changed, this command is needed to +update the paths, otherwise the other analysis modules will not work. + +Example: + cLoops2 update -d ./Trac + + +optional arguments: + -h, --help show this help message and exit + -d PREDIR Assign data directory generated by cLoops2 pre to carry out analysis. + -o FNOUT Output data directory / file name prefix, default is cLoops2_output. + -p CPU CPUs used to run the job, default is 1, set -1 to use all CPUs + available. Too many CPU could cause out-of-memory problem if there are + too many PETs. + -cut CUT Distance cutoff to filter cis PETs, only keep PETs with distance + >=cut. Default is 0, no filtering. + -mcut MCUT Keep the PETs with distance <=mcut. Default is -1, no filtering. + -v Show cLoops2 verison number and exit. + --- Following are sub-commands specific options. This option just show + version of cLoops2. +``` + +------ +### 4. Convert cLoops2 data to others +Run **cLoops2 dump -h** to see details. +``` +Convert cLoops2 data files to other types. Currently supports BED file,BEDPE +file, HIC file, washU long-range track, bedGraph file and matrix txt file. + +Converting cLoops2 data to .hic file needs "juicer_tools pre" in the command +line enviroment. +Converting cLoops2 data to legacy washU browser long-range track needs bgzip +and tabix. Format reference: http://wiki.wubrowse.org/Long-range. +Converting cLoops2 data to UCSC bigInteract track needs bedToBigBed. Format +reference: https://genome.ucsc.edu/goldenPath/help/interact.html. +Converting cLoops2 data to bedGraph track will normalize value as RPM +(reads per million). Run with -bdg_pe flag for 1D data such as ChIC-seq, +ChIP-seq and ATAC-seq. +Converting cLoops2 data to matrix txt file will need specific resolution. +The output txt file can be loaded in TreeView for visualization or further +analysis. + +Examples: + 1. convert cLoops2 data to single-end .bed file fo usage of BEDtools or + MACS2 for peak-calling with close PETs + cLoops2 dump -d trac -o trac -bed -mcut 1000 + + 2. convert cLoops2 data to .bedpe file for usage of BEDtools, only keep + PETs distance >1kb and < 1Mb + cLoops2 dump -d trac -o trac -bedpe -bedpe_ext -cut 1000 -mcut 1000000 + + 3. convert cLoops2 data to .hic file to load in juicebox + cLoops2 dump -d trac -o trac -hic -hic_org hg38 \ + -hic_res 200000,20000,5000 + + 4. 
convert cLoops2 data to washU long-range track file, only keep PETs + distance > 1kb + cLoops2 dump -d trac -o trac -washU -washU_ext 50 -cut 1000 + + 5. convert cLoops2 data to UCSC bigInteract track file + cLoops2 dump -d trac -o trac -ucsc -ucsc_cs ./hg38.chrom.sizes + + 6. convert interacting cLoops2 data to bedGraph file with all PETs + cLoops2 dump -d trac -o trac -bdg -bdg_ext 100 + + 7. convert 1D cLoops2 data (such as ChIC-seq/ChIP-seq/ATAC-seq) to bedGraph + file + cLoops2 dump -d trac -o trac -bdg -pe + + 8. convert 3D cLoops2 data (such as Trac-looping) to bedGraph file for peaks + cLoops2 dump -d trac -o trac -bdg -mcut 1000 + + 9. convert one region in chr21 to contact matrix correlation matrix txt file + cLoops2 dump -d test -mat -o test -mat_res 10000 \ + -mat_chrom chr21-chr21 -mat_start 36000000 \ + -mat_end 40000000 -log -corr + + +optional arguments: + -h, --help show this help message and exit + -d PREDIR Assign data directory generated by cLoops2 pre to carry out analysis. + -o FNOUT Output data directory / file name prefix, default is cLoops2_output. + -p CPU CPUs used to run the job, default is 1, set -1 to use all CPUs + available. Too many CPU could cause out-of-memory problem if there are + too many PETs. + -cut CUT Distance cutoff to filter cis PETs, only keep PETs with distance + >=cut. Default is 0, no filtering. + -mcut MCUT Keep the PETs with distance <=mcut. Default is -1, no filtering. + -v Show cLoops2 verison number and exit. + --- Following are sub-commands specific options. This option just show + version of cLoops2. + -bed Convert data to single-end BED file. + -bed_ext BED_EXT Extension from the center of the read to both ends for BED file. + Default is 50. + -bedpe Convert data to BEDPE file. + -bedpe_ext BEDPE_EXT Extension from the center of the PET to both ends for BEDPE file. + Default is 50. + -hic Convert data to .hic file. + -hic_org HIC_ORG Organism required to generate .hic file,default is hg38. If the + organism is not available, assign a chrom.size file. + -hic_res HIC_RES Resolutions used to generate .hic file. Default is 1000,5000,25000, + 50000,100000,200000. + -washU Convert data to legacy washU browser long-range track. + -washU_ext WASHU_EXT Extension from the center of the PET to both ends for washU track. + Default is 50. + -ucsc Convert data to UCSC bigInteract file track. + -ucsc_ext UCSC_EXT Extension from the center of the PET to both ends for ucsc + track. Default is 50. + -ucsc_cs UCSC_CS A chrom.sizes file. Can be obtained through fetchChromSizese. + Required for -ucsc option. + -bdg Convert data to 1D bedGraph track file. + -bdg_ext BDG_EXT Extension from the center of the PET to both ends for + bedGraph track. Default is 50. + -bdg_pe When converting to bedGraph, argument determines whether to treat PETs + as ChIP-seq, ChIC-seq or ATAC-seq paired-end libraries. Default is not. + PETs are treated as single-end library for interacting data. + -mat Convert data to matrix txt file with required resolution. + -mat_res MAT_RES Bin size/matrix resolution (bp) to generate the contact matrix. + Default is 5000 bp. + -mat_chrom CHROM The chrom-chrom set will be processed. Specify it as chr1-chr1. + -mat_start START Start genomic coordinate for the target region. Default will be the + smallest coordinate from specified chrom-chrom set. + -mat_end END End genomic coordinate for the target region. Default will be the + largest coordinate from specified chrom-chrom set. + -log Whether to log transform the matrix. Default is not. 
+ -m {obs,obs/exp} The type of matrix, observed matrix or observed/expected matrix, + expected matrix will be generated by shuffling PETs. Default is + observed. + -corr Whether to get the correlation matrix. Default is not. + -norm Whether to normalize the matrix with z-score. Default is not. + +``` + + +------ +### 5. Estimate eps +Run **cLoops2 estEps -h** to see details. +``` +Estimate key parameter eps. + +Two methods are implemented: 1) unsupervised Gaussian mixture model (gmm), and +2) k-distance plot (k-dis,-k needed). Gmm is based on the assumption that PETs +can be classified into self-ligation (peaks) and inter-ligation (loops). K-dis +is based on the k-nearest neighbors distance distribution to find the "knee", +which is where the distance (eps) between neighbors has a sharp increase along +the k-distance curve. K-dis is the traditional approach literatures, but it is +much more time consuming than gmm, and maybe only fit to small cases. If both +methods do not give nice plots, please turn to the empirical parameters you +like, such as 100,200 for ChIP-seq -like data, 5000,1000 for Hi-C and etc. + +Examples: + 1. estimate eps with Gaussian mixture model + cLoops2 estEps -d trac -o trac_estEps_gmm -p 10 -method gmm + + 2. estimate eps with k-nearest neighbors distance distribution + cLoops2 estEps -d trac -o trac_estEps_kdis -p 10 -method k-dis -k 5 + + +optional arguments: + -h, --help show this help message and exit + -d PREDIR Assign data directory generated by cLoops2 pre to carry out analysis. + -o FNOUT Output data directory / file name prefix, default is cLoops2_output. + -p CPU CPUs used to run the job, default is 1, set -1 to use all CPUs available. Too many CPU could cause out-of-memory problem if there are too many PETs. - -cut CUT Distance cutoff to filter cis PETs, only keep PETs with distance + -cut CUT Distance cutoff to filter cis PETs, only keep PETs with distance >=cut. Default is 0, no filtering. - -mcut MCUT Keep the PETs with distance <=mcut. Default is -1, no filtering. - -v Show cLoops2 verison number and exit. - --- Following are sub-commands specific options. This option just show + -mcut MCUT Keep the PETs with distance <=mcut. Default is -1, no filtering. + -v Show cLoops2 verison number and exit. + --- Following are sub-commands specific options. This option just show version of cLoops2. - -c CHROMS Whether to process limited chroms, specify it as chr1,chr2,chr3, - default is not. Use this to save time for quite big data. - -bs BINSIZE Bin size / contact matrix resolution (bp) to generate the contact - matrix for estimation, default is 5000 bp. - -r REPEATS The reapet times to shuffle PETs to get the mean expected background, - default is 10. - -plot Set to plot the result. - ``` - - ------ - ### 8. Filter PETs - Run **cLoops2 filterPETs -h** to see details - ``` - Filter PETs according to peaks/domains/loops/singletons/KNNs. - - If any end of the PETs overlap with features such as peaks or loops, the PET - will be kept. Filtering can be done before or after peak/loop-calling. Input - can be peaks or loops, but should not be be mixed. The -singleton mode is based - on a specified contact matrix resolution, if there is only one PET in the bin, - the singleton PETs will be filtered. The -knn is based on noise removing step - of blockDBSCAN. - - Examples: - 1. keep PETs overlapping with peaks - cLoops2 filterPETs -d trac -peaks peaks.bed -o trac_filtered - - 2. 
keep PETs that do not overlap with any blacklist regions - cLoops2 filterPETs -d trac -peaks bg.bed -o trac_filtered -iv - - 3. keep PETs that overlap with loop anchors - cLoops2 filterPETs -d trac -loops test_loops.txt -o trac_filtered - - 4. keep PETs that both ends overlap with loop anchors - cLoops2 filterPETs -d trac -loops test_loops.txt -o trac_filtered -both - - 5. keep non-singleton PETs based on 1kb contact matrix - cLoops2 filterPETs -d trac -o trac_filtered -singleton -bs 1000 - - 6. filter PETs based on blockDBSCAN knn noise removing - cLoops2 filterPETs -d trac -o trac_filtered -knn -eps 1000 -minPts 5 - - optional arguments: - -h, --help show this help message and exit - -d PREDIR Assign data directory generated by cLoops2 pre to carry out analysis. - -o FNOUT Output data directory / file name prefix, default is cLoops2_output. - -p CPU CPUs used to run the job, default is 1, set -1 to use all CPUs - available. Too many CPU could cause out-of-memory problem if there are - too many PETs. - -cut CUT Distance cutoff to filter cis PETs, only keep PETs with distance - >=cut. Default is 0, no filtering. - -mcut MCUT Keep the PETs with distance <=mcut. Default is -1, no filtering. - -v Show cLoops2 verison number and exit. - --- Following are sub-commands specific options. This option just show - version of cLoops2. - -peaks FBED BED file of genomic features (such as promoters, enhancers, ChIP-seq, - ATAC-seq peaks,TADs) to filter PETs. - -loops FLOOP The loop.txt file generated by cLoops2, can be loops or domains, to - filter PETs. - -gap GAP If the distance between two genomic features is <=gap, the two regions - will be combined. Default is 1. Set to >=1. - -singleton Whether to use singleton mode to filter PETs. Contact matrix - resolution with -bs is required. Singleton PETs in contact matrix bins - will be filtered. - -bs BINSIZE The contact matrix bin size for -singleton mode filtering. Default is - 5000. - -knn Whether to use noise removing method in blockDBSCAN to filter PETs, - -eps and -minPts are required. - -eps EPS Same to callPeaks and callLoops, only used to filter PETs for -knn - mode. Default is 1000. Only one value is supported. - -minPts MINPTS Same to callPeaks and callLoops, only used to filter PETs for -knn - mode. Default is 5. Only one value is supported. - -iv Whether to only keep PETs not in the assigned regions, behaves like - grep -v. - -both Whether to only keep PETs that both ends overlap with loop anchors. - Default is not. - ``` - - ------ - ### 9. Sampling PETs - Run **cLoops2 samplePETs -h** to see details. - ``` - Sampling PETs to target total size. - - If there are multiple sample libraries and the total sequencing depths vary a - lot, and you want to compare the data fairly, it's better to sample them to - similar total PETs (either down-sampling or up-sampling), then call peaks/loops - with the same parameters. - - Example: - cLoops2 samplePETs -d trac -o trac_sampled -tot 5000000 -p 10 - - - optional arguments: - -h, --help show this help message and exit - -d PREDIR Assign data directory generated by cLoops2 pre to carry out analysis. - -o FNOUT Output data directory / file name prefix, default is cLoops2_output. - -p CPU CPUs used to run the job, default is 1, set -1 to use all CPUs + -fixy FIXY Assign the .ixy file to estimate eps inside of the whole directory + generated by cLoops2 pre. For very large data, especially Hi-C, this + option is recommended for chr1 (or the smaller one) to save time. 
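+
+    # Illustration (not part of the `cLoops2 estEps -h` output): a toy k-distance
+    # sketch of the "knee" idea behind -method k-dis, using sklearn as a stand-in;
+    # this is not the cLoops2 implementation and the coordinates are simulated.
+    import numpy as np
+    from sklearn.neighbors import NearestNeighbors
+    xs = np.sort(np.random.randint(0, 1_000_000, size=5000)).reshape(-1, 1)
+    k = 5                                      # suggested for ChIA-PET/Trac-looping
+    dists, _ = NearestNeighbors(n_neighbors=k + 1).fit(xs).kneighbors(xs)
+    kdis = np.sort(dists[:, k])                # column 0 is each point itself
+    # plotting kdis and reading eps off its sharp increase locates the "knee"
+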
+ -k KNN The k-nearest neighbors used to draw the k-distance plot. Default is 0 + (not running), set this when -method k-dis. Suggested 5 for + ChIA-PET/Trac-looping data, 20 or 30 for Hi-C like data. + -method {gmm,k-dis} Two methods can be chosen to estimate eps. Default is Gmm. See above + for difference of the methods. + +``` + +------ +### 6. Estimate reasonable contact matrix resolution +Run **cLoops2 estRes -h** to see details. +``` +Estimate reasonable genome-wide contact matrix resolution based on signal +enrichment. + +PETs will be assigned to contact matrix bins according to input resolution. A +bin is marked as [nx,ny], and a PET is assigned to a bin by nx = int((x-s)/bs), +ny = int((y-s)/bs), where s is the minimal coordinate for all PETs and bs is +the bin size. Self-interaction bins (nx=ny) will be ignored. The bins only +containing singleton PETs are assumed as noise. + +The output is a PDF plot, for each resolution, a line is separated into two +parts: 1) dash line indicated linear increased trend of singleton PETs/bins; 2) +solid thicker line indicated non-linear increased trend of higher potential +signal PETs/bins. The higher the ratio of signal PETs/bins, the easier it it to +find loops in that resolution. The closer to the random line, the higher the +possibility to observe evenly distributed signals. + +We expect the highest resolution with >=50% PETs are not singletons. + +Example: + cLoops2 estRes -d trac -o trac -bs 10000,5000,1000 -p 20 + +optional arguments: + -h, --help show this help message and exit + -d PREDIR Assign data directory generated by cLoops2 pre to carry out analysis. + -o FNOUT Output data directory / file name prefix, default is cLoops2_output. + -p CPU CPUs used to run the job, default is 1, set -1 to use all CPUs + available. Too many CPU could cause out-of-memory problem if there are + too many PETs. + -cut CUT Distance cutoff to filter cis PETs, only keep PETs with distance + >=cut. Default is 0, no filtering. + -mcut MCUT Keep the PETs with distance <=mcut. Default is -1, no filtering. + -v Show cLoops2 verison number and exit. + --- Following are sub-commands specific options. This option just show + version of cLoops2. + -bs BINSIZE Candidate contact matrix resolution (bin size) to estimate signal + enrichment. A series of comma-separated values or a single value can + be used as input. For example,-bs 1000,5000,10000. Default is 5000. + +``` + +------ +### 7. Estimate significant interaction distance range +Run **cLoops2 estDis -h** to see details. +``` +Estimate the significant interaction distance limitation by getting the observed +and expected random background of the genomic distance vs interaction frequency. + +Example: + cLoops2 estDis -d trac -o trac -bs 5000 -p 20 -plot + + +optional arguments: + -h, --help show this help message and exit + -d PREDIR Assign data directory generated by cLoops2 pre to carry out analysis. + -o FNOUT Output data directory / file name prefix, default is cLoops2_output. + -p CPU CPUs used to run the job, default is 1, set -1 to use all CPUs + available. Too many CPU could cause out-of-memory problem if there are + too many PETs. + -cut CUT Distance cutoff to filter cis PETs, only keep PETs with distance + >=cut. Default is 0, no filtering. + -mcut MCUT Keep the PETs with distance <=mcut. Default is -1, no filtering. + -v Show cLoops2 verison number and exit. + --- Following are sub-commands specific options. This option just show + version of cLoops2. 
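+
+    # Illustration (not part of the `cLoops2 estDis -h` output): a toy sketch of
+    # the bin assignment described for estRes above, which the -bs options there
+    # and here both control; the PET coordinates below are hypothetical.
+    from collections import Counter
+    xs = [10_500, 11_200, 55_000, 98_000]      # left ends of PETs
+    ys = [18_000, 19_100, 61_000, 430_000]     # right ends of PETs
+    bs = 5000                                  # bin size / matrix resolution
+    s = min(xs + ys)                           # minimal coordinate of all PETs
+    pairs = [((x - s) // bs, (y - s) // bs) for x, y in zip(xs, ys)]
+    pairs = [(nx, ny) for nx, ny in pairs if nx != ny]  # skip self-interaction bins
+    singletons = sum(1 for n in Counter(pairs).values() if n == 1)  # noise bins
+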
+ -c CHROMS Whether to process limited chroms, specify it as chr1,chr2,chr3, + default is not. Use this to save time for quite big data. + -bs BINSIZE Bin size / contact matrix resolution (bp) to generate the contact + matrix for estimation, default is 5000 bp. + -r REPEATS The reapet times to shuffle PETs to get the mean expected background, + default is 10. + -plot Set to plot the result. +``` + +------ +### 8. Filter PETs +Run **cLoops2 filterPETs -h** to see details +``` +Filter PETs according to peaks/domains/loops/singletons/KNNs. + +If any end of the PETs overlap with features such as peaks or loops, the PET +will be kept. Filtering can be done before or after peak/loop-calling. Input +can be peaks or loops, but should not be be mixed. The -singleton mode is based +on a specified contact matrix resolution, if there is only one PET in the bin, +the singleton PETs will be filtered. The -knn is based on noise removing step +of blockDBSCAN. + +Examples: + 1. keep PETs overlapping with peaks + cLoops2 filterPETs -d trac -peaks peaks.bed -o trac_filtered + + 2. keep PETs that do not overlap with any blacklist regions + cLoops2 filterPETs -d trac -peaks bg.bed -o trac_filtered -iv + + 3. keep PETs that overlap with loop anchors + cLoops2 filterPETs -d trac -loops test_loops.txt -o trac_filtered + + 4. keep PETs that both ends overlap with loop anchors + cLoops2 filterPETs -d trac -loops test_loops.txt -o trac_filtered -both + + 5. keep non-singleton PETs based on 1kb contact matrix + cLoops2 filterPETs -d trac -o trac_filtered -singleton -bs 1000 + + 6. filter PETs based on blockDBSCAN knn noise removing + cLoops2 filterPETs -d trac -o trac_filtered -knn -eps 1000 -minPts 5 + +optional arguments: + -h, --help show this help message and exit + -d PREDIR Assign data directory generated by cLoops2 pre to carry out analysis. + -o FNOUT Output data directory / file name prefix, default is cLoops2_output. + -p CPU CPUs used to run the job, default is 1, set -1 to use all CPUs + available. Too many CPU could cause out-of-memory problem if there are + too many PETs. + -cut CUT Distance cutoff to filter cis PETs, only keep PETs with distance + >=cut. Default is 0, no filtering. + -mcut MCUT Keep the PETs with distance <=mcut. Default is -1, no filtering. + -v Show cLoops2 verison number and exit. + --- Following are sub-commands specific options. This option just show + version of cLoops2. + -peaks FBED BED file of genomic features (such as promoters, enhancers, ChIP-seq, + ATAC-seq peaks,TADs) to filter PETs. + -loops FLOOP The loop.txt file generated by cLoops2, can be loops or domains, to + filter PETs. + -gap GAP If the distance between two genomic features is <=gap, the two regions + will be combined. Default is 1. Set to >=1. + -singleton Whether to use singleton mode to filter PETs. Contact matrix + resolution with -bs is required. Singleton PETs in contact matrix bins + will be filtered. + -bs BINSIZE The contact matrix bin size for -singleton mode filtering. Default is + 5000. + -knn Whether to use noise removing method in blockDBSCAN to filter PETs, + -eps and -minPts are required. + -eps EPS Same to callPeaks and callLoops, only used to filter PETs for -knn + mode. Default is 1000. Only one value is supported. + -minPts MINPTS Same to callPeaks and callLoops, only used to filter PETs for -knn + mode. Default is 5. Only one value is supported. + -iv Whether to only keep PETs not in the assigned regions, behaves like + grep -v. 
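+
+    # Illustration (not part of the `cLoops2 filterPETs -h` output): a toy sketch
+    # of the -singleton idea described above (keep a PET only when its contact
+    # matrix bin holds more than one PET); the coordinates are hypothetical and
+    # the real implementation details may differ.
+    from collections import Counter
+    pets = [(10_500, 18_000), (10_900, 18_300), (55_000, 61_000)]
+    bs = 1000                                  # -bs, contact matrix bin size
+    bins = Counter((x // bs, y // bs) for x, y in pets)
+    kept = [p for p in pets if bins[(p[0] // bs, p[1] // bs)] > 1]
+    # kept -> the first two PETs; the lone PET in bin (55, 61) is filtered out
+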
+ -both Whether to only keep PETs that both ends overlap with loop anchors. + Default is not. +``` + +------ +### 9. Sampling PETs +Run **cLoops2 samplePETs -h** to see details. +``` +Sampling PETs to target total size. + +If there are multiple sample libraries and the total sequencing depths vary a +lot, and you want to compare the data fairly, it's better to sample them to +similar total PETs (either down-sampling or up-sampling), then call peaks/loops +with the same parameters. + +Example: + cLoops2 samplePETs -d trac -o trac_sampled -tot 5000000 -p 10 + + +optional arguments: + -h, --help show this help message and exit + -d PREDIR Assign data directory generated by cLoops2 pre to carry out analysis. + -o FNOUT Output data directory / file name prefix, default is cLoops2_output. + -p CPU CPUs used to run the job, default is 1, set -1 to use all CPUs + available. Too many CPU could cause out-of-memory problem if there are + too many PETs. + -cut CUT Distance cutoff to filter cis PETs, only keep PETs with distance + >=cut. Default is 0, no filtering. + -mcut MCUT Keep the PETs with distance <=mcut. Default is -1, no filtering. + -v Show cLoops2 verison number and exit. + --- Following are sub-commands specific options. This option just show + version of cLoops2. + -tot TOT Target total number of PETs. +``` + +------ +### 10. Call peaks for 1D or 3D data +Run **cLoops2 callPeaks -h** to see details. +``` +Call peaks based on clustering. + +Well tested work for ChIP-seq, ChIC-seq, ATAC-seq, CUT&RUN -like or the 3D +genomic data such as Hi-TrAC/Trac-looping, ChIA-PET and HiChIP. + +There are three steps in the algorithm: 1) cluster the PETs to find +self-ligation clusters, which are candidate peaks; 2) estimate the significance +of candidate peaks with local background; 3) if given control data, further +compare candidate peaks to control data. If running multiple clusterings with +separated parameters, the clusters will be combined and callPeaks will output +the most significant one based on overlaps. + +Key parameters are -eps and -minPts, both are key parameters in the clustering +algorithm blockDBSCAN. Eps indicates the distance that define two points (PETs) +being neighbors, while minPts indicatess the minial number of points required +for a cluster to form. For sharp-peak like data (ATAC-seq, TF ChIC-seq), set +-eps small such as 100 or 150. For broad-peak like data, such as H3K27me3 +ChIP-seq and ChIC-seq, set -eps large as 500 or 1000. + +Eps will affect more than minPts for sensitivity. + +Examples: + 1. call peaks for Trac-looping + cLoops2 callPeaks -d trac -eps 100 -minPts 10 -o trac -p 10 + + 2. call peaks for sharp-peak like ChIC-seq without control data + cLoops2 callPeaks -d ctcf_chic -o ctcf_chic -p 10 + + 3. call peaks for broad-peak like ChIC-seq with IgG as control + cLoops2 callPeaks -d H3K27me3 -bgd IgG -eps 500,1000 -minPts 10 \ + -o H3K27me3 + + 4. call peaks for sharp-peak ChIC-seq with linear fitting scaled control + data + cLoops2 callPeaks -d ctcf -bgd IgG -eps 150 -minPts 10 -o ctcf -p 10\ + -bgm lf + + 5. call peaks with sentitive mode to get comprehensive peaks for CUT&TAG + cLoops2 callPeaks -d H3K27ac -bgd IgG -sen -p 10 + + 6. filter PETs first and then call peaks for H3K27ac HiChIP, resulting much + much accurate peaks + cLoops2 filterPETs -d h3k27ac_hichip -o h3k27ac_hichip_filtered -knn \ + -eps 500 -minPts 5 + cLoops2 callPeaks -d h3k27ac_hichip_filtered -eps 200,500 -minPts 10 \ + -p 10 + + 7. 
call peaks for interaction data as single-end data + cLoops2 callPeaks -d h3k27ac -o h3k27ac -split -eps 200,500 -minPts 10 \ + -p 10 + + 8. call differential peaks between WT and KO condition + cLoops2 callPeaks -d MLL4_WT -bgd MLL4_KO -o MLL4_WTvsKO -p 10 + cLoops2 callPeaks -d MLL4_KO -bgd MLL4_WT -o MLL4_KOvsWT -p 10 + + +optional arguments: + -h, --help show this help message and exit + -d PREDIR Assign data directory generated by cLoops2 pre to carry out analysis. + -o FNOUT Output data directory / file name prefix, default is cLoops2_output. + -p CPU CPUs used to run the job, default is 1, set -1 to use all CPUs available. Too many CPU could cause out-of-memory problem if there are too many PETs. - -cut CUT Distance cutoff to filter cis PETs, only keep PETs with distance + -cut CUT Distance cutoff to filter cis PETs, only keep PETs with distance >=cut. Default is 0, no filtering. - -mcut MCUT Keep the PETs with distance <=mcut. Default is -1, no filtering. - -v Show cLoops2 verison number and exit. - --- Following are sub-commands specific options. This option just show + -mcut MCUT Keep the PETs with distance <=mcut. Default is -1, no filtering. + -v Show cLoops2 verison number and exit. + --- Following are sub-commands specific options. This option just show version of cLoops2. - -tot TOT Target total number of PETs. - ``` - - ------ - ### 10. Call peaks for 1D or 3D data - Run **cLoops2 callPeaks -h** to see details. - ``` - Call peaks based on clustering. - - Well tested work for ChIP-seq, ChIC-seq, ATAC-seq, CUT&RUN -like or the 3D - genomic data such as Hi-TrAC/Trac-looping, ChIA-PET and HiChIP. - - There are three steps in the algorithm: 1) cluster the PETs to find - self-ligation clusters, which are candidate peaks; 2) estimate the significance - of candidate peaks with local background; 3) if given control data, further - compare candidate peaks to control data. If running multiple clusterings with - separated parameters, the clusters will be combined and callPeaks will output - the most significant one based on overlaps. - - Key parameters are -eps and -minPts, both are key parameters in the clustering - algorithm blockDBSCAN. Eps indicates the distance that define two points (PETs) - being neighbors, while minPts indicatess the minial number of points required - for a cluster to form. For sharp-peak like data (ATAC-seq, TF ChIC-seq), set - -eps small such as 100 or 150. For broad-peak like data, such as H3K27me3 - ChIP-seq and ChIC-seq, set -eps large as 500 or 1000. - - Eps will affect more than minPts for sensitivity. - - Examples: - 1. call peaks for Trac-looping - cLoops2 callPeaks -d trac -eps 100 -minPts 10 -o trac -p 10 - - 2. call peaks for sharp-peak like ChIC-seq without control data - cLoops2 callPeaks -d ctcf_chic -o ctcf_chic -p 10 - - 3. call peaks for broad-peak like ChIC-seq with IgG as control - cLoops2 callPeaks -d H3K27me3 -bgd IgG -eps 500,1000 -minPts 10 \ - -o H3K27me3 - - 4. call peaks for sharp-peak ChIC-seq with linear fitting scaled control - data - cLoops2 callPeaks -d ctcf -bgd IgG -eps 150 -minPts 10 -o ctcf -p 10\ - -bgm lf - - 5. call peaks with sentitive mode to get comprehensive peaks for CUT&TAG - cLoops2 callPeaks -d H3K27ac -bgd IgG -sen -p 10 - - 6. filter PETs first and then call peaks for H3K27ac HiChIP, resulting much - much accurate peaks - cLoops2 filterPETs -d h3k27ac_hichip -o h3k27ac_hichip_filtered -knn \ - -eps 500 -minPts 5 - cLoops2 callPeaks -d h3k27ac_hichip_filtered -eps 200,500 -minPts 10 \ - -p 10 - - 7. 
call peaks for interaction data as single-end data - cLoops2 callPeaks -d h3k27ac -o h3k27ac -split -eps 200,500 -minPts 10 \ - -p 10 - - 8. call differential peaks between WT and KO condition - cLoops2 callPeaks -d MLL4_WT -bgd MLL4_KO -o MLL4_WTvsKO -p 10 - cLoops2 callPeaks -d MLL4_KO -bgd MLL4_WT -o MLL4_KOvsWT -p 10 - - - optional arguments: - -h, --help show this help message and exit - -d PREDIR Assign data directory generated by cLoops2 pre to carry out analysis. - -o FNOUT Output data directory / file name prefix, default is cLoops2_output. - -p CPU CPUs used to run the job, default is 1, set -1 to use all CPUs - available. Too many CPU could cause out-of-memory problem if there are - too many PETs. - -cut CUT Distance cutoff to filter cis PETs, only keep PETs with distance - >=cut. Default is 0, no filtering. - -mcut MCUT Keep the PETs with distance <=mcut. Default is -1, no filtering. - -v Show cLoops2 verison number and exit. - --- Following are sub-commands specific options. This option just show - version of cLoops2. - -eps EPS Distance that defines two points (PETs) being neighbors, eps in - blockDBSCAN as key parameter, multiple eps can be assigned such as - 100,200,300 to run multiple clusterings, the results will be combined. - For callPeaks, the default is 100,200. If the data show much more broad - feature such as H3K27me3 and H3K4me1, increase it to 500,1000 or larger. - If expecting both narrow and broad peaks in the data, set -eps 100,200, - 500,1000. - -minPts MINPTS Points required in a cluster, minPts in blockDBSCAN, key parameter, - multiple minPts can be assigned such as 3,5 to run multiple - clusterings, the results will be combined. For callPeaks, the default - is 5. If the data have many reads, increasing minPts such as 10,20. - -pcut PCUT Bonferroni corrected poisson p-value cutoff to determine significant - peaks. Default is 1e-2. - -bgd BGD Assign control data (IgG, Input) directory generated by cLoops2 pre to - carry out analysis. Default is no background. - -bgm {ratio,lf} How to scale the target data with control data. Available options are - 'ratio' and 'lf'. 'ratio' is based on library size and 'lf' means - linear fitting for control and target candidate peaks nearby regions. - Default is 'lf'. The scaling factor estimated by lf usually is a little - larger than ratio. In other words, the higher the scaling factor, the - less sensitive the results. - -pseudo PSEUDO Pseudo counts for local background or control data to estimate the - significance of peaks if no PETs/reads in the background. Default is - 1. Set it larger for noisy data, 0 is recommend for very clean data - such as well prepared CUT&Tag. - -sen Whether to use sensitive mode to call peaks. Default is not. If only a - few peaks were called, while a lot more can be observed - from visualization, try this option. Adjust -pcut or filter by - yourself to select significant ones. - -split Whether to split paired-end as single end data to call peaks. Sometimes - works well for Trac-looping and HiChIP. - -splitExt SPLITEXT When run with -split, the extension to upstraem and downstream, - default is 50. - ``` - - - ------ - ### 11. Call loops - Run **cLoops2 callLoops -h** to see details. - ``` - Call loops based on clustering. - - Well tested work for Hi-TrAC/TrAC-looping, HiCHiP, ChIA-PET and Hi-C. 
- - Similar to call peaks, there are three main steps in the algorithm: 1) cluster - the PETs to find inter-ligation clusters, which are candidate loops; 2) - estimate the significance of candidate loops with permutated local background. - 3) If -hic option not selected, the loop anchors will be checked for peak-like - features, only peak-like anchors are kept. If running multiple clusterings, - the clusters will be combined and callLoops will output the most significant - one based on overlaps. - - Similar to callPeaks, keys parameters are -eps and -minPts. For sharp-peak like - interaction data, set -eps small such as 500,1000. For broad-peak like data, - such as H3K27ac HiChIP, set -eps big as 1000,2000. For Hi-C and HiChIP data, - bigger -minPts is also needed, such as 20,50. - - Please note that the blockDBSCAN implementation in cLoops2 is much more - sensitive than cDBSCAN in cLoops, so the same parameters can generate quite - different results. With -hic option, cDBSCAN will be used. - - Examples: - 1. call loops for Hi-TrAC/Trac-looping - cLoops2 callLoops -d trac -o trac -eps 200,500,1000,2000 -minPts 5 -w -j - - 2. call loops for Hi-TrAC/Trac-looping with filtering short distance PETs - and using maximal estimated distance cutoff - cLoops2 callLoops -d trac -o trac -eps 200,500,1000,2000 -minPts 5 \ - -cut 1000 -max_cut -w -j - - 3. call loops for Hi-TrAC/Trac-looping and get the PETs with any end - overlapping loop anchors - cLoops2 callLoops -d trac -o trac -eps 200,500,1000,2000 -minPts 5 -w \ - -j -filterPETs - - 4. call loops for high-resolution Hi-C like data - cLoops2 callLoops -d hic -o hic -eps 2000,5000,10000 -minPts 20,50 -w -j - - 5. call inter-chromosomal loops (for most data, there will be no significant - inter-chromosomal loops) - cLoops2 callLoops -d HiC -eps 5000 -minPts 10,20,50,100,200 -w -j -trans\ - -o HiC_trans - - - optional arguments: - -h, --help show this help message and exit - -d PREDIR Assign data directory generated by cLoops2 pre to carry out analysis. - -o FNOUT Output data directory / file name prefix, default is cLoops2_output. - -p CPU CPUs used to run the job, default is 1, set -1 to use all CPUs - available. Too many CPU could cause out-of-memory problem if there are - too many PETs. - -cut CUT Distance cutoff to filter cis PETs, only keep PETs with distance - >=cut. Default is 0, no filtering. - -mcut MCUT Keep the PETs with distance <=mcut. Default is -1, no filtering. - -v Show cLoops2 verison number and exit. - --- Following are sub-commands specific options. This option just show - version of cLoops2. - -eps EPS Distance that defines two points (PETs) being neighbors, eps in - blockDBSCAN as key parameter, multiple eps can be assigned such as - 200,500,1000,2000 to run multiple clusterings, the results will be - combined. No default value, please give the input. - -minPts MINPTS Points required in a cluster. minPts in blockDBSCAN is a key parameter. - Empirically 5 is good for TFs and histone modification ChIA-PET data - and Trac-looping. For data like HiChIP and Hi-C, set it larger, like - >=20. The input can be a series, and the final loops will have the - PETs>= max(minPts). - -plot Whether to plot estimated inter-ligation and self-ligation PETs - distance distribution. Default is not to generate a plot. - -i Whether to convert loops to UCSC Interact track to visualize in UCSC. - Default is not, set this flag to save. - -j Whether to convert loops to 2D feature annotations to visualize in - Juicebox. 
Default is not, set this flag to save. - -w Whether to save tracks of loops to visualize in legacy and new washU. - Default is not, set this flag to save two files. - -max_cut When running cLoops with multiple eps or minPts, multiple distance - cutoffs for self-ligation and inter-ligation PETs will be estimated - based on the overlaps of anchors. Default option is the minimal one - will be used to filter PETs for candidate loop significance test. - Set this flag to use maximal one, will speed up for significance test. - -hic Whether to use statistical cutoffs for Hi-C to output significant loops. - Default is not, set this option to enable. Additionally, with -hic - option, there is no check for anchors requiring they looking like peaks. - -filter Whether to filter raw PETs according to called loops. The filtered - PETs can show clear view of interactions or be used to call loops again. - -trans Whether to call trans- (inter-chromosomal) loops. Default is not, set - this flag to call. For most common cases, not recommended, only for - data there are obvious visible trans loops. - -emPair By default eps and minPts combinations will be used to run clustering. - With this option, for example eps=500,1000 and minPts=5,10, only (500,5) - and (1000,10) as parameters of clustering will be run. Input number of - eps and minPts should be same. - - ``` - - ------ - ### 12. Call differentially enriched intra-chromosomal loops - Run **cLoops2 callDiffLoops -h** to see details. - ``` - Call differentially enriched intra-chromosomal loops between two conditions. - - Similar to calling peaks with control data, calling differentially enriched - loops is based on scaled PETs and the Poisson test. There are three main steps - in the algorithm: 1) merge the overlapped loops, quantify them and their - permutated local background regions; 2) fit the linear transformation of - background target interaction density to control background data based on - MANorm2; therefore, if there are more than than two samples, others can be - scaled to the reference sample for quantitative comparison; 3) estimate the - fold change (M) cutoff and average (A) cutoff using the background data with - the control of FDR, assuming there should no differentially significant - interactions called from the background data; or using the assigned cutoffs; 4) - estimate the significance based on the Poisson test for transformed data, both - for the loop and loop anchors. For example, if transformed PETs for target is - 5, PETs for control is 3 while control nearby permutated background median is - 4, then for the Poisson test, lambda=4-1 is used to test the observed 5 to call - p-value. - - Example: - 1. classical usage - cLoops2 callDiffLoops -tloop target_loop.txt -cloop control_loop.txt \ - -td ./target -cd ./control -o target_diff - - 2. customize MA cutoffs - cLoops2 callDiffLoops -tloop target_loop.txt -cloop control_loop.txt \ - -td ./target -cd ./control -o target_diff -cutomize \ - -acut 5 -mcut 0.5 - - - optional arguments: - -h, --help show this help message and exit - -d PREDIR Assign data directory generated by cLoops2 pre to carry out analysis. - -o FNOUT Output data directory / file name prefix, default is cLoops2_output. - -p CPU CPUs used to run the job, default is 1, set -1 to use all CPUs - available. Too many CPU could cause out-of-memory problem if there are - too many PETs. - -cut CUT Distance cutoff to filter cis PETs, only keep PETs with distance - >=cut. Default is 0, no filtering. 
- -mcut MCUT Keep the PETs with distance <=mcut. Default is -1, no filtering. - -v Show cLoops2 verison number and exit. - --- Following are sub-commands specific options. This option just show - version of cLoops2. - -tloop TLOOP The target loops in _loop.txt file called by cLoops2. - -cloop CLOOP The control loops in _loop.txt file called by cLoops2. - -td TPRED The data directory generated by cLoops2 for target data. - -cd CPRED The data directory generated by cLoops2 for control data. - -pcut PCUT Poisson p-value cutoff to determine significant differentially - enriched loops after Bonferroni correction , default is 1e-2. - -igp Ignore Poisson p-value cutoff and only using FDR to control MA plot - cutoffs. - -noPCorr Do not performe Bonferroni correction of Poisson p-values. Will get - more loops. Default is always performing. - -fdr FDR FDR cutoff for estimating fold change (M) and average value (A) after - normalization with background data. Default is 0.1. - -j Whether to convert loops to 2D feature annotations to visualize in - Juicebox. Default is not, set this flag to save. - -w Whether to save tracks of loops to visualize in legacy and new washU. - Default is not, set this flag to save two files. - -customize Whether to use cutomized cutoffs of MA plot. Defulat is not. If enable - -acut and -mcut is needed. - -cacut CACUT Average cutoff for MA plot of normalized PETs of loops. Assign when - -customize option used. - -cmcut CMCUT Fold change cutoff for MA plot of normalized PETs of loops. Assign when - -customize option used. - -vmin VMIN The minimum value shown in the heatmap and colorbar. - -vmax VMAX The maxmum value shown in the heatmap and colorbar. - -cmap {summer,red,div,cool} - The heatmap style. Default is summer. - - - ``` - - ------ - ### 13. Call domains - Run **cLoops2 callDomains -h** to see details. - ``` - Call domains for the 3D genomic data based on correlation matrix and local - segregation score. - - Well tested work for Hi-TrAC/Trac-looping data. - - Examples: - 1. call Hi-C like TADs - cLoops2 callDomains -d trac -o trac -bs 5000,10000 -ws 500000 -p 20 - - 2. call Hi-TrAC/Trac-looping specific small domains - cLoops2 callDomains -d trac -o trac -bs 1000 -ws 100000 -p 20 - - 3. call domains for Hi-C - cLoops2 callDomains -d hic -o hic -bs 10000 -ws 500000 -hic - - optional arguments: - -h, --help show this help message and exit - -d PREDIR Assign data directory generated by cLoops2 pre to carry out analysis. - -o FNOUT Output data directory / file name prefix, default is cLoops2_output. - -p CPU CPUs used to run the job, default is 1, set -1 to use all CPUs - available. Too many CPU could cause out-of-memory problem if there are - too many PETs. - -cut CUT Distance cutoff to filter cis PETs, only keep PETs with distance - >=cut. Default is 0, no filtering. - -mcut MCUT Keep the PETs with distance <=mcut. Default is -1, no filtering. - -v Show cLoops2 verison number and exit. - --- Following are sub-commands specific options. This option just show - version of cLoops2. - -bs BINSIZE Candidate contact matrix resolution (bin size) to call domains. A - series of values or a single value can be used as input. Default is - 10000. If given multiple values, callDomains will try to call nested - domains. Samll value may lead to samller domains. - -ws WINSIZE The half of the sliding window size used to caculate local correlation, - Default is 500000 (500kb). Larger value may lead to larger domains. - -hic Whether to use cutoffs for Hi-C to output significant domains. 
- Default is not. Set this option to enable, cutoffs will be more loose. - ``` - - ------ - ### 14. Plot the interaction as heatmap/scatter/arches, 1D signals, peaks, loops and domains - Run **cLoops2 plot -h** to see details. - ``` - Plot the interaction data as a heatmap (or arches/scatter) with additional of - virtual 4C view point, 1D tracks (bigWig files), 1D annotations (peaks, genes) - and 2D annotations (domains). If -f is not assigned, will just plot profiles - from bigWig file or bed files. - - Examples: - 1. plot the simple square heatmap for a specific region with 1kb resolution - with genes - cLoops2 plot -f test/chr21-chr21.ixy -o test -bs 1000 -start 34840000 \ - -end 34895000 -log -gtf test.gtf - - 2. plot the upper triangle heatmap with domains such as TAD and CTCF bigWig - track - cLoops2 plot -f test/chr21-chr21.ixy -o test_domain -bs 10000 \ - -start 34600000 -end 35500000 -domains HiC_TAD.bed -log \ - -triu -bws GM12878_CTCF_chr21.bw - - 3. plot the heatmap as upper triangle with 1D signal track and filter the - PETs shorter than 1kb - cLoops2 plot -f test/chr21-chr21.ixy -o test -bs 500 -start 34840000 \ - -end 34895000 -log -triu -1D -cut 1000 - - 4. plot the observation/expectation interaction heatmap with 1D signal - cLoops2 plot -f test/chr21-chr21.ixy -o test -m obs/exp -1D -triu \ - -bs 500 -start 34840000 -end 34895000 - - 5. plot the chromosome-wide correlation heatmap - cLoops2 plot -f test/chr21-chr21.ixy -o test -corr - - 6. plot upper triangle interaction heatmap together with genes, bigWig - files, peaks, loops, domains, control the heatmap scale - cLoops2 plot -f test/chr21-chr21.ixy -o test -bs 500 -start 34840000 \ - -end 34895000 -triu -bws ATAC.bw,CTCF.bw -1D \ - -loop test_loops.txt -beds Enh.bed,Tss.bed \ - -domains tad.bed -m obs -log -vmin 0.2 -vmax 2 -gtf genes.gtf - - 7. plot small regions interacting PETs as arches - cLoops2 plot -f test/chr21-chr21.ixy -o test -start 46228500 \ - -end 46290000 -1D -loops gm_loops.txt -arch -aw 0.05 - - 8. plot small regions interacting PETs as scatter plot - cLoops2 plot -f test/chr21-chr21.ixy -o test -start 46228500 \ - -end 46290000 -1D -loops gm_loops.txt -scatter - - 9. plot Hi-C compartments and eigenvector - cLoops2 plot -f test/chr21-chr21.ixy -o test -bs 100000 -log -corr -eig - - optional arguments: - -h, --help show this help message and exit - -d PREDIR Assign data directory generated by cLoops2 pre to carry out analysis. - -o FNOUT Output data directory / file name prefix, default is cLoops2_output. - -p CPU CPUs used to run the job, default is 1, set -1 to use all CPUs - available. Too many CPU could cause out-of-memory problem if there are - too many PETs. - -cut CUT Distance cutoff to filter cis PETs, only keep PETs with distance - >=cut. Default is 0, no filtering. - -mcut MCUT Keep the PETs with distance <=mcut. Default is -1, no filtering. - -v Show cLoops2 verison number and exit. - --- Following are sub-commands specific options. This option just show - version of cLoops2. - -f FIXY Input .ixy file generated by cLoops2 pre. If not assigned, no heatmaps - or arches will be shown and -chrom is needed to generate plots similar - to IGV or other browser. - -bs BINSIZE Bin size/matrix resolution (bp) to generate the contact matrix for - plotting, default is 5000 bp. - -chrom CHROM Chromosome for the target region if -f is not assigned. - -start START Start genomic coordinate for the target region. Default is 0. - -end END End genomic coordinate for the target region. 
Default is to infer - from the data. - -loops FLOOP The _loop.txt file generated by cLoops2, will be used to plot loops as - arches. - -loopCut LOOPCUT Only show loops with more than loopCut PETs. Default is 0. - -domains FDOMAIN The domains need to annotated in the heatmap such as TADs, should be - .bed file. - -beds BEDS BED tracks of genomic features to plot above the heatmap, such as - promoters and enhancers, track name will be inferred from file name, - for example enhancer.bed,promoter.bed. - -gtf GTF GTF track of genes to plot above the heatmap. - -bws BWS BigWig tracks to plot above the heatmap, track name will be inferred - from file name, for example a.bw,b.bw,c.bw. - -bwvs BWVS BigWig tracks y-axis limitations. Default is atuo-determined. Assign - as 'vmin,vmax;vmin,vmax;vmin,vmax'. For example, '0,1;;0,1' for three - bigWig tracks, as the second track kept atuo-determined. Due to - argparse limitation for parsing minus value, also can be assigned as - vmax,vmin. - -bwcs BWCS BigWig tracks colors. Default is atuo-determined. Assign as - 0,1,2 for three bigWig tracks. Values seperated by comma. - -log Whether to log transform the matrix. - -m {obs,obs/exp} The type of matrix to plot, observed matrix or observed/expected - matrix, expected matrix will be generated by shuffling PETs, default - is observed. - -corr Whether to plot the correlation matrix. Default is not. Correlation - heatmap will use dark mode color map, used together with obs method. - -norm Whether to normalize the matrix with z-score. - -triu Whether to rotate the heatmap only show upper triangle, default is - False. - -vmin VMIN The minimum value shown in the heatmap and colorbar. - -vmax VMAX The maxmum value shown in the heatmap and colorbar. - -1D Whether to plot the pileup 1D signal for the region. Default is not. - Please note, the 1D signal is aggregated from the visualization region. - If want to check the signal from each position of all genome/chromosome, - use cLoops2 dump -bdg to get the bigWig file. - -1Dv ONEDV 1D profile y-axis limitations. Default is auto-determined. Assign as - vmin,vmax, for example 0,1. - -virtual4C Whether to plot the virtual 4C view point 1D signal. Default is not. - If assigned, -view_start and -view_end are needed. - -view_start VIEWSTART - Start genomic coordinate for the view point start region, only valid - when -vitrutal4C is set, should >=start and <=end. - -view_end VIEWEND End genomic coordinate for the view point end region, only valid - when -vitrutal4C is set, should >=start and <=end. - -4Cv VIEWV Virtual 4C profile y-axis limitations. Default is auto-determined. - Assign as vmin,vmax, for example 0,1. - -arch Whether to plot interacting PETs as arches. Default is not. If - set, only original one PET one arch will be shown. Usefule to check - small region for raw data, especially when heatmap is not clear. - -aw AW Line width for each PET in arches plot. Default is 1. Try to - change it if too many or few PETs. - -ac AC Line color for each PET in arches plot. Default is 4. Try to - change it see how many colors are supported by cLoops2. - -aa AA Alpha to control arch color saturation. Default is 1. - -scatter Whether to plot interacting PETs as scatter dots. Default is not. - If set, only original one PET one dot will be shown. Usefule to check - raw data, especially when heatmap is not clear that -vmax is too small. - -ss SS Dot size for each PET in scatter plot. Default is 1. Try to - change it to optimize the plot. 
- -sc SC Dot color for each PET in scatter plot. Default is 0. Try to - change it see how many colors are supported by cLoops2. - -sa SA Alpha to control dot color saturation. Default is 1. - -eig Whether to plot the PC1 of correlation matirx to show compartments - Default is not. Only work well for big regions such as resolution - of 100k. - -eig_r Whether to flip the PC1 values of -eig. It should be dependend on - inactivate or activate histone markers, as actually the PCA values do - not have directions, especially comparing different samples. - -figWidth {4,8} Figure width. 4 is good to show the plot as half of a A4 figure - width and 8 is good to show more wider. Default is 4. - - - ``` - - ------ - ### 15. Montage analysis for regions of interactions - Run **cLoops2 montage -h** to see details. - ``` - Montage analysis of specific regions, producing Westworld Season 3 -like - Rehoboam plot. - - Examples: - 1. showing all PETs for a gene's promoter and enhancers - cLoops2 montage -f test/chr21-chr21.ixy -bed test.bed -o test - - 2. showing simplified PETs for a gene's promoter and enhancers - cLoops2 montage -f test/chr21-chr21.ixy -bed test.bed -o test -simple - - 3. adjust interacting link width - cLoops2 montage -f test/chr21-chr21.ixy -bed test.bed -o test -simple \ - -ppmw 10 - - 4. showing all PETs for a region, if in the bed file only contains one region - cLoops2 montage -f test/chr21-chr21.ixy -bed test.bed -o test -ext 0 - - - optional arguments: - -h, --help show this help message and exit - -d PREDIR Assign data directory generated by cLoops2 pre to carry out analysis. - -o FNOUT Output data directory / file name prefix, default is cLoops2_output. - -p CPU CPUs used to run the job, default is 1, set -1 to use all CPUs - available. Too many CPU could cause out-of-memory problem if there are - too many PETs. - -cut CUT Distance cutoff to filter cis PETs, only keep PETs with distance - >=cut. Default is 0, no filtering. - -mcut MCUT Keep the PETs with distance <=mcut. Default is -1, no filtering. - -v Show cLoops2 verison number and exit. - --- Following are sub-commands specific options. This option just show - version of cLoops2. - -f FIXY Input .ixy file generated by cLoops2 pre. - -bed BED Input .bed file for target regions, 4th columns should be id/name for - the region. - -ext EXT Up-stream and down-stream extesion of target region length. Default is - 2. If the input bed already include up/down-stream regions, assign as 0. - -simple Whether to only draw the representative interactions between two target - regions as one arch, and not include the interactions in extended - regions. Default is not, all interactions will be shown as archs.. - -vp VIEWPOINT Only show interactions with specific regions from all other regions. - Name/id (4th column in .bed file) is need. Default is to show all - releated interactions. Multiple names/ids can be assigned by seperation - of comma. - -vmin VMIN The minial scale for 1D pileup data. Default will be inferred from the - data. - -vmax VMAX The maxmial scale for 1D pileup data. Default will be inferred from the - data. - -ppmw PPMW Link line width indicator, short for 1 PETs per Million PETs line - width, default is 10. Adjust this value when -simple is used. Decrease - it if links are too bold and increase it when links are too thin. - -aw AW Line width for each PET if -simple is not selected. Default is 1. - -no1D Whether to not plot 1D profiles. Default is plot. Set this for Hi-C - like data. - ``` - - ------ - ### 16. 
Aggregation analysis for peaks, loops and domains - Run **cLoops2 agg -h** to see details. - ``` - Do the aggregation analysis for peaks, loops, view points and domains. - - The output figures can be used directly, and the data to generate the plot are - also saved for further customized analysis. - - For the aggregated peaks analysis,input is a .bed file annotated with the - coordinates for the target regions/peaks/anchors. Output is a .pdf file - containing a mean density plot and heatmap and a .txt file for the data. The - data in the .txt file and plot were normalized to RPM (reads per million). - - For the aggregated view points analysis, input is a .bed file annotated with - coordinates for the target regions/peaks/anchors as view point. Output is a - .pdf file containing a mean density plot and heatmap and a .txt file for the - data. The data in the .txt file and plot were normalized to - log2( RPM (reads per million)+1). - - For the aggregated loops analysis, input is a _loops.txt file annotated with - the coordinates for target loops, similar to the format of BEDPE. Output is a - .pdf file for mean heatmap and .npz file generated through numpy.savez for all - loops and nearby regions matrix. The enrichment score (ES) in the plot is - calculated as: ES = mean( (PETs in loop)/(mean PETs of nearby regions) ). Other - files except _loops.txt can be used as input, as long as the file contains key - information in the first columns separated by tabs: - loopId chrA startA endA chrB startB endB distance - loop-1 chr21 1000 2000 chr21 8000 9000 7000 - - There is another option for loops analysis, termed as two anchors. Input file is - same to aggregated loops analysis. The whole region with assigned extesion - between two anchors will be aggregated and 1D profile can show two anchors. The - analysis could be usefule to study/comapre different classes of anchors and - combinations, for example, considering CTCT motif directions, all left anchors - CTCF motifs are in positive strand and in negative strand for all right anchors. - It could be interesting for some loops one anchor only bound by transcription - factor a and another anchor only bound by transcription b. - - For the aggregated domains analysis, input is a .bed file annotated with the - coordinates for the domains, such as TADs. Output are a .pdf file for the upper - triangular heatmap and .npz file generated through numpy.savez for all domains - and nearby region matrix. The enrichment score (ES) in the plot is calculated - as mean( (two ends both with in domain PETs number)/( only one end in domain - PETs number) ). - - Examples: - 1. show aggregated peaks heatmap and profile - cLoops2 agg -d test -peaks peaks.bed -o test -peak_ext 2500 \ - -peak_bins 200 -peak_norm -skipZeros - - 2. show aggregated view points and aggregated bigWig signal - cLoops2 agg -d test -o test -viewPoints test_peaks.bed -bws CTCF.bw - - 3. show aggregated loops heatmap, 1D profile and aggregated bigWig signal - cLoops2 agg -d test -o test -loops test_loops.txt -bws CTCF.bw -1D \ - -loop_norm - - 3. show aggregated loops heatmap, 1D profile and aggregated bigWig signal - in two anchors mode - cLoops2 agg -d test -o test -twoAnchors test_loops.txt -bws CTCF.bw -1D \ - -loop_norm - - 4. 
show aggregated domains heatmap, 1D profile and aggregated bigWig signal - cLoops2 agg -d test -o test -domains TAD.bed -bws CTCF.bw -1D - - - optional arguments: - -h, --help show this help message and exit - -d PREDIR Assign data directory generated by cLoops2 pre to carry out analysis. - -o FNOUT Output data directory / file name prefix, default is cLoops2_output. - -p CPU CPUs used to run the job, default is 1, set -1 to use all CPUs - available. Too many CPU could cause out-of-memory problem if there are - too many PETs. - -cut CUT Distance cutoff to filter cis PETs, only keep PETs with distance - >=cut. Default is 0, no filtering. - -mcut MCUT Keep the PETs with distance <=mcut. Default is -1, no filtering. - -v Show cLoops2 verison number and exit. - --- Following are sub-commands specific options. This option just show - version of cLoops2. - -peaks PEAKF The .bed file for peaks-centric aggregation analysis. - -peak_ext PEAK_EXT The nearby upstream and downstream regions (bp) from the peak center. - Default is 5000. - -peak_bins PEAK_BINS The bin size for the profile array of peaks. Default is 100. - -peak_norm Whether to normalize the data in the peaks profile plot and - heatmap with row-wise z-score. Default is not. - -viewPoints VIEWPOINTF - The .bed file for view points -centric aggregation analysis. - -viewPointUp VIEWPOINTUP - The upstream regions included for the aggreaged view points analysis. - Default is 100000 bp. - -viewPointDown VIEWPOINTDOWN - The downstream regions included for the aggreaged view points analysis. - Default is 100000 bp. - -viewPointBs VIEWPOINTBS - Contact matrix bin size for view points heatmap. Default is 1000 bp. - -viewPoint_norm Whether to normalize the sub-matrix for each loop as divide the mean - PETs for the matrix. Default is not. - -loops LOOPF The _loop.txt file generated by cLoops2 for loops-centric - aggregation analysis. The file first 8 columns are necessary. - -loop_ext LOOP_EXT The nearby regions included to plot in the heatmap and calculation of - enrichment for aggregation loop analysis, default is 10, should be - even number. - -loop_cut LOOP_CUT Distance cutoff for loops to filter. Default is 0. - -loop_norm Whether to normalize the sub-matrix for each loop as divide the mean - PETs for the matrix (except the loop region). Default is not. - -twoAnchors TWOANCHORSF - The similar _loop.txt file generated by cLoops2 for two anchors - aggregation analysis. The file first 8 columns are necessary. - -twoAnchor_ext TWOANCHOR_EXT - The nearby regions of fold included to plot in heatmap. - Default is 0.1. - -twoAnchor_vmin TWOANCHOR_VMIN - The minimum value shown in the domain heatmap and colorbar. - -twoAnchor_vmax TWOANCHOR_VMAX - The maxmum value shown in the domain heatmap and colorbar. - -domains DOMAINF The .bed file annotated the domains such as TADs for aggregated - domains-centric analysis. - -domain_ext DOMAIN_EXT - The nearby regions of fold included to plot in heatmap and - caculation of enrichment, default is 0.5. - -domain_vmin DOMAIN_VMIN - The minimum value shown in the domain heatmap and colorbar. - -domain_vmax DOMAIN_VMAX - The maxmum value shown in the domain heatmap and colorbar. - -1D Whether to plot the pileup 1D signal for aggregated loops, - aggregated view points or aggregated domains. Default is not. - -bws BWS BigWig tracks to plot above the aggregated loops heatmap (or under - the aggregated domains heatmap), track name will be inferred from file - name, for example a.bw,b.bw,c.bw. 
- -skipZeros Whether to remove all 0 records. Default is not. - - ``` - - ------ - ### 17. Quantification of peaks, loops and domains - Run **cLoops2 quant -h** to see details. - ``` - Quantify the peaks, loops and domains. The output file will be the same as - outputs of callPeaks, callLoops and callDomains. - - Examples: - 1. quantify peaks - cLoops2 quant -d test -peaks peaks.bed -o test - - 2. quantify loops - cLoops2 quant -d test -loops test_loops.txt -o test - - 3. quantify domains - cLoops2 quant -d test -domains test_domains.txt -o test - - optional arguments: - -h, --help show this help message and exit - -d PREDIR Assign data directory generated by cLoops2 pre to carry out analysis. - -o FNOUT Output data directory / file name prefix, default is cLoops2_output. - -p CPU CPUs used to run the job, default is 1, set -1 to use all CPUs - available. Too many CPU could cause out-of-memory problem if there are - too many PETs. - -cut CUT Distance cutoff to filter cis PETs, only keep PETs with distance - >=cut. Default is 0, no filtering. - -mcut MCUT Keep the PETs with distance <=mcut. Default is -1, no filtering. - -v Show cLoops2 verison number and exit. - --- Following are sub-commands specific options. This option just show - version of cLoops2. - -peaks PEAKF The .bed file for peaks-centric quantification. - -loops LOOPF The _loop.txt file generated by cLoops2 for loops-centric - quantification, as long as there are first 8 columns. - -domains DOMAINF The _domains.txt file generated by cLoops2 for domains-centric - quantification, as long as there are first 3 columns - -domain_bs DOMAINBINSIZE - Candidate contact matrix resolution (bin size) to quantify domains, - default is 10000. Only one integer is supported. - -domain_ws DOMAINWINSIZE - The half window size used to calculate local correlation to quantify - domains. Default is 500000 (500kb). - -domain_bdg Whether to save the segregation score ad bedGraph file, default. - is not. - ``` - - ------ - ### 18. Annotation of loops to genes - Run **cLoops2 anaLoops -h** to see details. - ``` - Annotating loops: - - find the closest TSS for each loop anchors - - merge the loop anchors and classify them as enhancers or promoters based on - distance to nearest TSS - - build the interaction networks for merged anchors - - find the all interacted enhancers/promoters for each promoter - - Basic mode 1: with -gtf, loops will be annotated as enhancer or promoter based - on distance to nearest gene. If a anchor overlapped with two/multiple promoters - (often seen for close head-to-head genes), all will be reported. If no TSS - overlaps, then nearest one will be assigned. - - Basic mode 2: with -gtf -net, overlapped anchors will be merged and annoated as - enhancer or promoter considering distance to genes. For each promoter, all - linked enhancer and promoter will be shown. If there are more than 3 direct or - indirect enhancers for a promoter, HITS algorithm will be used to identify one - hub for indirect enhancer and one hub for indirect enhancer. - - Examples: - 1. annotate loops for target gene, basic mode 1 - cLoops2 anaLoops -loops test_loops.txt -gtf genecode.gtf - - 2. annotate loops for target transcripts (alternative TSS), basic mode 1 - cLoops2 anaLoops -loops test_loops.txt -gtf genecode.gtf -tid - - 3. 
find a gene's all linked enhancer or promoter, basic mode 2 - cLoops2 anaLoops -loops test_loops.txt -gtf genecode.gtf -net - - optional arguments: - -h, --help show this help message and exit - -d PREDIR Assign data directory generated by cLoops2 pre to carry out analysis. - -o FNOUT Output data directory / file name prefix, default is cLoops2_output. - -p CPU CPUs used to run the job, default is 1, set -1 to use all CPUs + -eps EPS Distance that defines two points (PETs) being neighbors, eps in + blockDBSCAN as key parameter, multiple eps can be assigned such as + 100,200,300 to run multiple clusterings, the results will be combined. + For callPeaks, the default is 100,200. If the data show much more broad + feature such as H3K27me3 and H3K4me1, increase it to 500,1000 or larger. + If expecting both narrow and broad peaks in the data, set -eps 100,200, + 500,1000. + -minPts MINPTS Points required in a cluster, minPts in blockDBSCAN, key parameter, + multiple minPts can be assigned such as 3,5 to run multiple + clusterings, the results will be combined. For callPeaks, the default + is 5. If the data have many reads, increasing minPts such as 10,20. + -pcut PCUT Bonferroni corrected poisson p-value cutoff to determine significant + peaks. Default is 1e-2. + -bgd BGD Assign control data (IgG, Input) directory generated by cLoops2 pre to + carry out analysis. Default is no background. + -bgm {ratio,lf} How to scale the target data with control data. Available options are + 'ratio' and 'lf'. 'ratio' is based on library size and 'lf' means + linear fitting for control and target candidate peaks nearby regions. + Default is 'lf'. The scaling factor estimated by lf usually is a little + larger than ratio. In other words, the higher the scaling factor, the + less sensitive the results. + -pseudo PSEUDO Pseudo counts for local background or control data to estimate the + significance of peaks if no PETs/reads in the background. Default is + 1. Set it larger for noisy data, 0 is recommend for very clean data + such as well prepared CUT&Tag. + -sen Whether to use sensitive mode to call peaks. Default is not. If only a + few peaks were called, while a lot more can be observed + from visualization, try this option. Adjust -pcut or filter by + yourself to select significant ones. + -split Whether to split paired-end as single end data to call peaks. Sometimes + works well for Trac-looping and HiChIP. + -splitExt SPLITEXT When run with -split, the extension to upstraem and downstream, + default is 50. +``` + + +------ +### 11. Call loops +Run **cLoops2 callLoops -h** to see details. +``` +Call loops based on clustering. + +Well tested work for Hi-TrAC/TrAC-looping, HiCHiP, ChIA-PET and Hi-C. + +Similar to call peaks, there are three main steps in the algorithm: 1) cluster +the PETs to find inter-ligation clusters, which are candidate loops; 2) +estimate the significance of candidate loops with permutated local background. +3) If -hic option not selected, the loop anchors will be checked for peak-like +features, only peak-like anchors are kept. If running multiple clusterings, +the clusters will be combined and callLoops will output the most significant +one based on overlaps. + +Similar to callPeaks, keys parameters are -eps and -minPts. For sharp-peak like +interaction data, set -eps small such as 500,1000. For broad-peak like data, +such as H3K27ac HiChIP, set -eps big as 1000,2000. For Hi-C and HiChIP data, +bigger -minPts is also needed, such as 20,50. 
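+
+    # Illustration (not part of the `cLoops2 callLoops -h` output): a toy sketch
+    # of how -eps/-minPts steer clustering, using sklearn's DBSCAN as a stand-in
+    # for blockDBSCAN; the PET coordinates below are simulated, not real data.
+    import numpy as np
+    from sklearn.cluster import DBSCAN
+    rng = np.random.default_rng(1)
+    left = rng.normal(100_000, 300, size=30)   # PET left ends near one anchor
+    right = rng.normal(180_000, 300, size=30)  # PET right ends near the other
+    pets = np.column_stack([left, right])      # a candidate inter-ligation cluster
+    noise = rng.uniform(0, 1_000_000, size=(20, 2))
+    labels = DBSCAN(eps=1000, min_samples=5).fit_predict(np.vstack([pets, noise]))
+    # label -1 marks noise; larger eps merges more distant PETs and larger
+    # min_samples (-minPts) demands denser candidate loops, as advised above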
+ +Please note that the blockDBSCAN implementation in cLoops2 is much more +sensitive than cDBSCAN in cLoops, so the same parameters can generate quite +different results. With -hic option, cDBSCAN will be used. + +Examples: + 1. call loops for Hi-TrAC/Trac-looping + cLoops2 callLoops -d trac -o trac -eps 200,500,1000,2000 -minPts 5 -w -j + + 2. call loops for Hi-TrAC/Trac-looping with filtering short distance PETs + and using maximal estimated distance cutoff + cLoops2 callLoops -d trac -o trac -eps 200,500,1000,2000 -minPts 5 \ + -cut 1000 -max_cut -w -j + + 3. call loops for Hi-TrAC/Trac-looping and get the PETs with any end + overlapping loop anchors + cLoops2 callLoops -d trac -o trac -eps 200,500,1000,2000 -minPts 5 -w \ + -j -filterPETs + + 4. call loops for high-resolution Hi-C like data + cLoops2 callLoops -d hic -o hic -eps 2000,5000,10000 -minPts 20,50 -w -j + + 5. call inter-chromosomal loops (for most data, there will be no significant + inter-chromosomal loops) + cLoops2 callLoops -d HiC -eps 5000 -minPts 10,20,50,100,200 -w -j -trans\ + -o HiC_trans + + +optional arguments: + -h, --help show this help message and exit + -d PREDIR Assign data directory generated by cLoops2 pre to carry out analysis. + -o FNOUT Output data directory / file name prefix, default is cLoops2_output. + -p CPU CPUs used to run the job, default is 1, set -1 to use all CPUs + available. Too many CPU could cause out-of-memory problem if there are + too many PETs. + -cut CUT Distance cutoff to filter cis PETs, only keep PETs with distance + >=cut. Default is 0, no filtering. + -mcut MCUT Keep the PETs with distance <=mcut. Default is -1, no filtering. + -v Show cLoops2 verison number and exit. + --- Following are sub-commands specific options. This option just show + version of cLoops2. + -eps EPS Distance that defines two points (PETs) being neighbors, eps in + blockDBSCAN as key parameter, multiple eps can be assigned such as + 200,500,1000,2000 to run multiple clusterings, the results will be + combined. No default value, please give the input. + -minPts MINPTS Points required in a cluster. minPts in blockDBSCAN is a key parameter. + Empirically 5 is good for TFs and histone modification ChIA-PET data + and Trac-looping. For data like HiChIP and Hi-C, set it larger, like + >=20. The input can be a series, and the final loops will have the + PETs>= max(minPts). + -plot Whether to plot estimated inter-ligation and self-ligation PETs + distance distribution. Default is not to generate a plot. + -i Whether to convert loops to UCSC Interact track to visualize in UCSC. + Default is not, set this flag to save. + -j Whether to convert loops to 2D feature annotations to visualize in + Juicebox. Default is not, set this flag to save. + -w Whether to save tracks of loops to visualize in legacy and new washU. + Default is not, set this flag to save two files. + -max_cut When running cLoops with multiple eps or minPts, multiple distance + cutoffs for self-ligation and inter-ligation PETs will be estimated + based on the overlaps of anchors. Default option is the minimal one + will be used to filter PETs for candidate loop significance test. + Set this flag to use maximal one, will speed up for significance test. + -hic Whether to use statistical cutoffs for Hi-C to output significant loops. + Default is not, set this option to enable. Additionally, with -hic + option, there is no check for anchors requiring they looking like peaks. + -filter Whether to filter raw PETs according to called loops. 
The filtered + PETs can show clear view of interactions or be used to call loops again. + -trans Whether to call trans- (inter-chromosomal) loops. Default is not, set + this flag to call. For most common cases, not recommended, only for + data there are obvious visible trans loops. + -emPair By default eps and minPts combinations will be used to run clustering. + With this option, for example eps=500,1000 and minPts=5,10, only (500,5) + and (1000,10) as parameters of clustering will be run. Input number of + eps and minPts should be same. + +``` + +------ +### 12. Call differentially enriched intra-chromosomal loops +Run **cLoops2 callDiffLoops -h** to see details. +``` +Call differentially enriched intra-chromosomal loops between two conditions. + +Similar to calling peaks with control data, calling differentially enriched +loops is based on scaled PETs and the Poisson test. There are three main steps +in the algorithm: 1) merge the overlapped loops, quantify them and their +permutated local background regions; 2) fit the linear transformation of +background target interaction density to control background data based on +MANorm2; therefore, if there are more than than two samples, others can be +scaled to the reference sample for quantitative comparison; 3) estimate the +fold change (M) cutoff and average (A) cutoff using the background data with +the control of FDR, assuming there should no differentially significant +interactions called from the background data; or using the assigned cutoffs; 4) +estimate the significance based on the Poisson test for transformed data, both +for the loop and loop anchors. For example, if transformed PETs for target is +5, PETs for control is 3 while control nearby permutated background median is +4, then for the Poisson test, lambda=4-1 is used to test the observed 5 to call +p-value. + +Example: + 1. classical usage + cLoops2 callDiffLoops -tloop target_loop.txt -cloop control_loop.txt \ + -td ./target -cd ./control -o target_diff + + 2. customize MA cutoffs + cLoops2 callDiffLoops -tloop target_loop.txt -cloop control_loop.txt \ + -td ./target -cd ./control -o target_diff -cutomize \ + -acut 5 -mcut 0.5 + + +optional arguments: + -h, --help show this help message and exit + -d PREDIR Assign data directory generated by cLoops2 pre to carry out analysis. + -o FNOUT Output data directory / file name prefix, default is cLoops2_output. + -p CPU CPUs used to run the job, default is 1, set -1 to use all CPUs available. Too many CPU could cause out-of-memory problem if there are too many PETs. - -cut CUT Distance cutoff to filter cis PETs, only keep PETs with distance + -cut CUT Distance cutoff to filter cis PETs, only keep PETs with distance >=cut. Default is 0, no filtering. - -mcut MCUT Keep the PETs with distance <=mcut. Default is -1, no filtering. - -v Show cLoops2 verison number and exit. - --- Following are sub-commands specific options. This option just show + -mcut MCUT Keep the PETs with distance <=mcut. Default is -1, no filtering. + -v Show cLoops2 verison number and exit. + --- Following are sub-commands specific options. This option just show version of cLoops2. - -loops FLOOP The _loop.txt file generated by cLoops2 callLoops or callDiffLoops. - -gtf GTF GTF file annotation for genes. - -tid Whether to use transcript id instead of gene id for annotation. Default + -tloop TLOOP The target loops in _loop.txt file called by cLoops2. + -cloop CLOOP The control loops in _loop.txt file called by cLoops2. 
+ -td TPRED The data directory generated by cLoops2 for target data. + -cd CPRED The data directory generated by cLoops2 for control data. + -pcut PCUT Poisson p-value cutoff to determine significant differentially + enriched loops after Bonferroni correction , default is 1e-2. + -igp Ignore Poisson p-value cutoff and only using FDR to control MA plot + cutoffs. + -noPCorr Do not performe Bonferroni correction of Poisson p-values. Will get + more loops. Default is always performing. + -fdr FDR FDR cutoff for estimating fold change (M) and average value (A) after + normalization with background data. Default is 0.1. + -j Whether to convert loops to 2D feature annotations to visualize in + Juicebox. Default is not, set this flag to save. + -w Whether to save tracks of loops to visualize in legacy and new washU. + Default is not, set this flag to save two files. + -customize Whether to use cutomized cutoffs of MA plot. Defulat is not. If enable + -acut and -mcut is needed. + -cacut CACUT Average cutoff for MA plot of normalized PETs of loops. Assign when + -customize option used. + -cmcut CMCUT Fold change cutoff for MA plot of normalized PETs of loops. Assign when + -customize option used. + -vmin VMIN The minimum value shown in the heatmap and colorbar. + -vmax VMAX The maxmum value shown in the heatmap and colorbar. + -cmap {summer,red,div,cool} + The heatmap style. Default is summer. + + +``` + +------ +### 13. Call domains +Run **cLoops2 callDomains -h** to see details. +``` +Call domains for the 3D genomic data based on correlation matrix and local +segregation score. + +Well tested work for Hi-TrAC/Trac-looping data. + +Examples: + 1. call Hi-C like TADs + cLoops2 callDomains -d trac -o trac -bs 5000,10000 -ws 500000 -p 20 + + 2. call Hi-TrAC/Trac-looping specific small domains + cLoops2 callDomains -d trac -o trac -bs 1000 -ws 100000 -p 20 + + 3. call domains for Hi-C + cLoops2 callDomains -d hic -o hic -bs 10000 -ws 500000 -hic + +optional arguments: + -h, --help show this help message and exit + -d PREDIR Assign data directory generated by cLoops2 pre to carry out analysis. + -o FNOUT Output data directory / file name prefix, default is cLoops2_output. + -p CPU CPUs used to run the job, default is 1, set -1 to use all CPUs + available. Too many CPU could cause out-of-memory problem if there are + too many PETs. + -cut CUT Distance cutoff to filter cis PETs, only keep PETs with distance + >=cut. Default is 0, no filtering. + -mcut MCUT Keep the PETs with distance <=mcut. Default is -1, no filtering. + -v Show cLoops2 verison number and exit. + --- Following are sub-commands specific options. This option just show + version of cLoops2. + -bs BINSIZE Candidate contact matrix resolution (bin size) to call domains. A + series of values or a single value can be used as input. Default is + 10000. If given multiple values, callDomains will try to call nested + domains. Samll value may lead to samller domains. + -ws WINSIZE The half of the sliding window size used to caculate local correlation, + Default is 500000 (500kb). Larger value may lead to larger domains. + -hic Whether to use cutoffs for Hi-C to output significant domains. + Default is not. Set this option to enable, cutoffs will be more loose. +``` + +------ +### 14. Plot the interaction as heatmap/scatter/arches, 1D signals, peaks, loops and domains +Run **cLoops2 plot -h** to see details. 
+``` +Plot the interaction data as a heatmap (or arches/scatter) with additional of +virtual 4C view point, 1D tracks (bigWig files), 1D annotations (peaks, genes) +and 2D annotations (domains). If -f is not assigned, will just plot profiles +from bigWig file or bed files. + +Examples: + 1. plot the simple square heatmap for a specific region with 1kb resolution + with genes + cLoops2 plot -f test/chr21-chr21.ixy -o test -bs 1000 -start 34840000 \ + -end 34895000 -log -gtf test.gtf + + 2. plot the upper triangle heatmap with domains such as TAD and CTCF bigWig + track + cLoops2 plot -f test/chr21-chr21.ixy -o test_domain -bs 10000 \ + -start 34600000 -end 35500000 -domains HiC_TAD.bed -log \ + -triu -bws GM12878_CTCF_chr21.bw + + 3. plot the heatmap as upper triangle with 1D signal track and filter the + PETs shorter than 1kb + cLoops2 plot -f test/chr21-chr21.ixy -o test -bs 500 -start 34840000 \ + -end 34895000 -log -triu -1D -cut 1000 + + 4. plot the observation/expectation interaction heatmap with 1D signal + cLoops2 plot -f test/chr21-chr21.ixy -o test -m obs/exp -1D -triu \ + -bs 500 -start 34840000 -end 34895000 + + 5. plot the chromosome-wide correlation heatmap + cLoops2 plot -f test/chr21-chr21.ixy -o test -corr + + 6. plot upper triangle interaction heatmap together with genes, bigWig + files, peaks, loops, domains, control the heatmap scale + cLoops2 plot -f test/chr21-chr21.ixy -o test -bs 500 -start 34840000 \ + -end 34895000 -triu -bws ATAC.bw,CTCF.bw -1D \ + -loop test_loops.txt -beds Enh.bed,Tss.bed \ + -domains tad.bed -m obs -log -vmin 0.2 -vmax 2 -gtf genes.gtf + + 7. plot small regions interacting PETs as arches + cLoops2 plot -f test/chr21-chr21.ixy -o test -start 46228500 \ + -end 46290000 -1D -loops gm_loops.txt -arch -aw 0.05 + + 8. plot small regions interacting PETs as scatter plot + cLoops2 plot -f test/chr21-chr21.ixy -o test -start 46228500 \ + -end 46290000 -1D -loops gm_loops.txt -scatter + + 9. plot Hi-C compartments and eigenvector + cLoops2 plot -f test/chr21-chr21.ixy -o test -bs 100000 -log -corr -eig + +optional arguments: + -h, --help show this help message and exit + -d PREDIR Assign data directory generated by cLoops2 pre to carry out analysis. + -o FNOUT Output data directory / file name prefix, default is cLoops2_output. + -p CPU CPUs used to run the job, default is 1, set -1 to use all CPUs + available. Too many CPU could cause out-of-memory problem if there are + too many PETs. + -cut CUT Distance cutoff to filter cis PETs, only keep PETs with distance + >=cut. Default is 0, no filtering. + -mcut MCUT Keep the PETs with distance <=mcut. Default is -1, no filtering. + -v Show cLoops2 verison number and exit. + --- Following are sub-commands specific options. This option just show + version of cLoops2. + -f FIXY Input .ixy file generated by cLoops2 pre. If not assigned, no heatmaps + or arches will be shown and -chrom is needed to generate plots similar + to IGV or other browser. + -bs BINSIZE Bin size/matrix resolution (bp) to generate the contact matrix for + plotting, default is 5000 bp. + -chrom CHROM Chromosome for the target region if -f is not assigned. + -start START Start genomic coordinate for the target region. Default is 0. + -end END End genomic coordinate for the target region. Default is to infer + from the data. + -loops FLOOP The _loop.txt file generated by cLoops2, will be used to plot loops as + arches. + -loopCut LOOPCUT Only show loops with more than loopCut PETs. Default is 0. 
+ -domains FDOMAIN The domains need to annotated in the heatmap such as TADs, should be + .bed file. + -beds BEDS BED tracks of genomic features to plot above the heatmap, such as + promoters and enhancers, track name will be inferred from file name, + for example enhancer.bed,promoter.bed. + -gtf GTF GTF track of genes to plot above the heatmap. + -bws BWS BigWig tracks to plot above the heatmap, track name will be inferred + from file name, for example a.bw,b.bw,c.bw. + -bwvs BWVS BigWig tracks y-axis limitations. Default is atuo-determined. Assign + as 'vmin,vmax;vmin,vmax;vmin,vmax'. For example, '0,1;;0,1' for three + bigWig tracks, as the second track kept atuo-determined. Due to + argparse limitation for parsing minus value, also can be assigned as + vmax,vmin. + -bwcs BWCS BigWig tracks colors. Default is atuo-determined. Assign as + 0,1,2 for three bigWig tracks. Values seperated by comma. + -log Whether to log transform the matrix. + -m {obs,obs/exp} The type of matrix to plot, observed matrix or observed/expected + matrix, expected matrix will be generated by shuffling PETs, default + is observed. + -corr Whether to plot the correlation matrix. Default is not. Correlation + heatmap will use dark mode color map, used together with obs method. + -norm Whether to normalize the matrix with z-score. + -triu Whether to rotate the heatmap only show upper triangle, default is + False. + -vmin VMIN The minimum value shown in the heatmap and colorbar. + -vmax VMAX The maxmum value shown in the heatmap and colorbar. + -1D Whether to plot the pileup 1D signal for the region. Default is not. + Please note, the 1D signal is aggregated from the visualization region. + If want to check the signal from each position of all genome/chromosome, + use cLoops2 dump -bdg to get the bigWig file. + -1Dv ONEDV 1D profile y-axis limitations. Default is auto-determined. Assign as + vmin,vmax, for example 0,1. + -virtual4C Whether to plot the virtual 4C view point 1D signal. Default is not. + If assigned, -view_start and -view_end are needed. + -view_start VIEWSTART + Start genomic coordinate for the view point start region, only valid + when -vitrutal4C is set, should >=start and <=end. + -view_end VIEWEND End genomic coordinate for the view point end region, only valid + when -vitrutal4C is set, should >=start and <=end. + -4Cv VIEWV Virtual 4C profile y-axis limitations. Default is auto-determined. + Assign as vmin,vmax, for example 0,1. + -arch Whether to plot interacting PETs as arches. Default is not. If + set, only original one PET one arch will be shown. Usefule to check + small region for raw data, especially when heatmap is not clear. + -aw AW Line width for each PET in arches plot. Default is 1. Try to + change it if too many or few PETs. + -ac AC Line color for each PET in arches plot. Default is 4. Try to + change it see how many colors are supported by cLoops2. + -aa AA Alpha to control arch color saturation. Default is 1. + -scatter Whether to plot interacting PETs as scatter dots. Default is not. + If set, only original one PET one dot will be shown. Usefule to check + raw data, especially when heatmap is not clear that -vmax is too small. + -ss SS Dot size for each PET in scatter plot. Default is 1. Try to + change it to optimize the plot. + -sc SC Dot color for each PET in scatter plot. Default is 0. Try to + change it see how many colors are supported by cLoops2. + -sa SA Alpha to control dot color saturation. Default is 1. 
+ -eig Whether to plot the PC1 of correlation matirx to show compartments + Default is not. Only work well for big regions such as resolution + of 100k. + -eig_r Whether to flip the PC1 values of -eig. It should be dependend on + inactivate or activate histone markers, as actually the PCA values do + not have directions, especially comparing different samples. + -figWidth {4,8} Figure width. 4 is good to show the plot as half of a A4 figure + width and 8 is good to show more wider. Default is 4. + + +``` + +------ +### 15. Montage analysis for regions of interactions +Run **cLoops2 montage -h** to see details. +``` +Montage analysis of specific regions, producing Westworld Season 3 -like +Rehoboam plot. + +Examples: + 1. showing all PETs for a gene's promoter and enhancers + cLoops2 montage -f test/chr21-chr21.ixy -bed test.bed -o test + + 2. showing simplified PETs for a gene's promoter and enhancers + cLoops2 montage -f test/chr21-chr21.ixy -bed test.bed -o test -simple + + 3. adjust interacting link width + cLoops2 montage -f test/chr21-chr21.ixy -bed test.bed -o test -simple \ + -ppmw 10 + + 4. showing all PETs for a region, if in the bed file only contains one region + cLoops2 montage -f test/chr21-chr21.ixy -bed test.bed -o test -ext 0 + + +optional arguments: + -h, --help show this help message and exit + -d PREDIR Assign data directory generated by cLoops2 pre to carry out analysis. + -o FNOUT Output data directory / file name prefix, default is cLoops2_output. + -p CPU CPUs used to run the job, default is 1, set -1 to use all CPUs + available. Too many CPU could cause out-of-memory problem if there are + too many PETs. + -cut CUT Distance cutoff to filter cis PETs, only keep PETs with distance + >=cut. Default is 0, no filtering. + -mcut MCUT Keep the PETs with distance <=mcut. Default is -1, no filtering. + -v Show cLoops2 verison number and exit. + --- Following are sub-commands specific options. This option just show + version of cLoops2. + -f FIXY Input .ixy file generated by cLoops2 pre. + -bed BED Input .bed file for target regions, 4th columns should be id/name for + the region. + -ext EXT Up-stream and down-stream extesion of target region length. Default is + 2. If the input bed already include up/down-stream regions, assign as 0. + -simple Whether to only draw the representative interactions between two target + regions as one arch, and not include the interactions in extended + regions. Default is not, all interactions will be shown as archs.. + -vp VIEWPOINT Only show interactions with specific regions from all other regions. + Name/id (4th column in .bed file) is need. Default is to show all + releated interactions. Multiple names/ids can be assigned by seperation + of comma. + -vmin VMIN The minial scale for 1D pileup data. Default will be inferred from the + data. + -vmax VMAX The maxmial scale for 1D pileup data. Default will be inferred from the + data. + -ppmw PPMW Link line width indicator, short for 1 PETs per Million PETs line + width, default is 10. Adjust this value when -simple is used. Decrease + it if links are too bold and increase it when links are too thin. + -aw AW Line width for each PET if -simple is not selected. Default is 1. + -no1D Whether to not plot 1D profiles. Default is plot. Set this for Hi-C + like data. +``` + +------ +### 16. Aggregation analysis for peaks, loops and domains +Run **cLoops2 agg -h** to see details. +``` +Do the aggregation analysis for peaks, loops, view points and domains. 
+ +The output figures can be used directly, and the data to generate the plot are +also saved for further customized analysis. + +For the aggregated peaks analysis,input is a .bed file annotated with the +coordinates for the target regions/peaks/anchors. Output is a .pdf file +containing a mean density plot and heatmap and a .txt file for the data. The +data in the .txt file and plot were normalized to RPM (reads per million). + +For the aggregated view points analysis, input is a .bed file annotated with +coordinates for the target regions/peaks/anchors as view point. Output is a +.pdf file containing a mean density plot and heatmap and a .txt file for the +data. The data in the .txt file and plot were normalized to +log2( RPM (reads per million)+1). + +For the aggregated loops analysis, input is a _loops.txt file annotated with +the coordinates for target loops, similar to the format of BEDPE. Output is a +.pdf file for mean heatmap and .npz file generated through numpy.savez for all +loops and nearby regions matrix. The enrichment score (ES) in the plot is +calculated as: ES = mean( (PETs in loop)/(mean PETs of nearby regions) ). Other +files except _loops.txt can be used as input, as long as the file contains key +information in the first columns separated by tabs: +loopId chrA startA endA chrB startB endB distance +loop-1 chr21 1000 2000 chr21 8000 9000 7000 + +There is another option for loops analysis, termed as two anchors. Input file is +same to aggregated loops analysis. The whole region with assigned extesion +between two anchors will be aggregated and 1D profile can show two anchors. The +analysis could be usefule to study/comapre different classes of anchors and +combinations, for example, considering CTCT motif directions, all left anchors +CTCF motifs are in positive strand and in negative strand for all right anchors. +It could be interesting for some loops one anchor only bound by transcription +factor a and another anchor only bound by transcription b. + +For the aggregated domains analysis, input is a .bed file annotated with the +coordinates for the domains, such as TADs. Output are a .pdf file for the upper +triangular heatmap and .npz file generated through numpy.savez for all domains +and nearby region matrix. The enrichment score (ES) in the plot is calculated +as mean( (two ends both with in domain PETs number)/( only one end in domain +PETs number) ). + +Examples: + 1. show aggregated peaks heatmap and profile + cLoops2 agg -d test -peaks peaks.bed -o test -peak_ext 2500 \ + -peak_bins 200 -peak_norm -skipZeros + + 2. show aggregated view points and aggregated bigWig signal + cLoops2 agg -d test -o test -viewPoints test_peaks.bed -bws CTCF.bw + + 3. show aggregated loops heatmap, 1D profile and aggregated bigWig signal + cLoops2 agg -d test -o test -loops test_loops.txt -bws CTCF.bw -1D \ + -loop_norm + + 3. show aggregated loops heatmap, 1D profile and aggregated bigWig signal + in two anchors mode + cLoops2 agg -d test -o test -twoAnchors test_loops.txt -bws CTCF.bw -1D \ + -loop_norm + + 4. show aggregated domains heatmap, 1D profile and aggregated bigWig signal + cLoops2 agg -d test -o test -domains TAD.bed -bws CTCF.bw -1D + + +optional arguments: + -h, --help show this help message and exit + -d PREDIR Assign data directory generated by cLoops2 pre to carry out analysis. + -o FNOUT Output data directory / file name prefix, default is cLoops2_output. + -p CPU CPUs used to run the job, default is 1, set -1 to use all CPUs + available. 
Too many CPU could cause out-of-memory problem if there are + too many PETs. + -cut CUT Distance cutoff to filter cis PETs, only keep PETs with distance + >=cut. Default is 0, no filtering. + -mcut MCUT Keep the PETs with distance <=mcut. Default is -1, no filtering. + -v Show cLoops2 verison number and exit. + --- Following are sub-commands specific options. This option just show + version of cLoops2. + -peaks PEAKF The .bed file for peaks-centric aggregation analysis. + -peak_ext PEAK_EXT The nearby upstream and downstream regions (bp) from the peak center. + Default is 5000. + -peak_bins PEAK_BINS The bin size for the profile array of peaks. Default is 100. + -peak_norm Whether to normalize the data in the peaks profile plot and + heatmap with row-wise z-score. Default is not. + -viewPoints VIEWPOINTF + The .bed file for view points -centric aggregation analysis. + -viewPointUp VIEWPOINTUP + The upstream regions included for the aggreaged view points analysis. + Default is 100000 bp. + -viewPointDown VIEWPOINTDOWN + The downstream regions included for the aggreaged view points analysis. + Default is 100000 bp. + -viewPointBs VIEWPOINTBS + Contact matrix bin size for view points heatmap. Default is 1000 bp. + -viewPoint_norm Whether to normalize the sub-matrix for each loop as divide the mean + PETs for the matrix. Default is not. + -loops LOOPF The _loop.txt file generated by cLoops2 for loops-centric + aggregation analysis. The file first 8 columns are necessary. + -loop_ext LOOP_EXT The nearby regions included to plot in the heatmap and calculation of + enrichment for aggregation loop analysis, default is 10, should be + even number. + -loop_cut LOOP_CUT Distance cutoff for loops to filter. Default is 0. + -loop_norm Whether to normalize the sub-matrix for each loop as divide the mean + PETs for the matrix (except the loop region). Default is not. + -twoAnchors TWOANCHORSF + The similar _loop.txt file generated by cLoops2 for two anchors + aggregation analysis. The file first 8 columns are necessary. + -twoAnchor_ext TWOANCHOR_EXT + The nearby regions of fold included to plot in heatmap. + Default is 0.1. + -twoAnchor_vmin TWOANCHOR_VMIN + The minimum value shown in the domain heatmap and colorbar. + -twoAnchor_vmax TWOANCHOR_VMAX + The maxmum value shown in the domain heatmap and colorbar. + -domains DOMAINF The .bed file annotated the domains such as TADs for aggregated + domains-centric analysis. + -domain_ext DOMAIN_EXT + The nearby regions of fold included to plot in heatmap and + caculation of enrichment, default is 0.5. + -domain_vmin DOMAIN_VMIN + The minimum value shown in the domain heatmap and colorbar. + -domain_vmax DOMAIN_VMAX + The maxmum value shown in the domain heatmap and colorbar. + -1D Whether to plot the pileup 1D signal for aggregated loops, + aggregated view points or aggregated domains. Default is not. + -bws BWS BigWig tracks to plot above the aggregated loops heatmap (or under + the aggregated domains heatmap), track name will be inferred from file + name, for example a.bw,b.bw,c.bw. + -skipZeros Whether to remove all 0 records. Default is not. + +``` + +------ +### 17. Quantification of peaks, loops and domains +Run **cLoops2 quant -h** to see details. +``` +Quantify the peaks, loops and domains. The output file will be the same as +outputs of callPeaks, callLoops and callDomains. + +Examples: + 1. quantify peaks + cLoops2 quant -d test -peaks peaks.bed -o test + + 2. quantify loops + cLoops2 quant -d test -loops test_loops.txt -o test + + 3. 
quantify domains + cLoops2 quant -d test -domains test_domains.txt -o test + +optional arguments: + -h, --help show this help message and exit + -d PREDIR Assign data directory generated by cLoops2 pre to carry out analysis. + -o FNOUT Output data directory / file name prefix, default is cLoops2_output. + -p CPU CPUs used to run the job, default is 1, set -1 to use all CPUs + available. Too many CPU could cause out-of-memory problem if there are + too many PETs. + -cut CUT Distance cutoff to filter cis PETs, only keep PETs with distance + >=cut. Default is 0, no filtering. + -mcut MCUT Keep the PETs with distance <=mcut. Default is -1, no filtering. + -v Show cLoops2 verison number and exit. + --- Following are sub-commands specific options. This option just show + version of cLoops2. + -peaks PEAKF The .bed file for peaks-centric quantification. + -loops LOOPF The _loop.txt file generated by cLoops2 for loops-centric + quantification, as long as there are first 8 columns. + -domains DOMAINF The _domains.txt file generated by cLoops2 for domains-centric + quantification, as long as there are first 3 columns + -domain_bs DOMAINBINSIZE + Candidate contact matrix resolution (bin size) to quantify domains, + default is 10000. Only one integer is supported. + -domain_ws DOMAINWINSIZE + The half window size used to calculate local correlation to quantify + domains. Default is 500000 (500kb). + -domain_bdg Whether to save the segregation score ad bedGraph file, default. is not. - -pdis PDIS Distance limitation for anchor to nearest gene/transcript TSS to define - as promoter. Default is 2000 bp. - -net Whether to use network method to find all enhancer/promoter links based - on loops. Default is not. In this mode, overlapped anchors will be - merged and annotated as enhancer/promoter, then for a gene, all linked - node will be output. - -gap GAP When -net is set, the distance for close anchors to merge. Default is 1. - - ``` - - ------ - ### 19. Find target genes of genomic regions with cLoops2 anaLoops output - Run **cLoops2 findTargets -h** to see details. - ``` - Find target genes of genomic regions (peaks, SNPs) through enhancer-promoter - networks. Output from cLoops2 anaLoops with suffix of _ep_net.sif and - _targets.txt are needed. - - Examples: - 1. find target genes of peaks/SNPs - cLoops2 findTargets -net test_ep_net.sif -tg test_targets.txt \ - -bed GWAS.bed -o test - - optional arguments: - -h, --help show this help message and exit - -d PREDIR Assign data directory generated by cLoops2 pre to carry out analysis. - -o FNOUT Output data directory / file name prefix, default is cLoops2_output. - -p CPU CPUs used to run the job, default is 1, set -1 to use all CPUs - available. Too many CPU could cause out-of-memory problem if there are - too many PETs. - -cut CUT Distance cutoff to filter cis PETs, only keep PETs with distance - >=cut. Default is 0, no filtering. - -mcut MCUT Keep the PETs with distance <=mcut. Default is -1, no filtering. - -v Show cLoops2 verison number and exit. - --- Following are sub-commands specific options. This option just show - version of cLoops2. - -net FNET The _ep_net.sif file generated by cLoops2 anaLoops. - -tg FTG The _targets.txt file generated by cLoops2 anaLoops. - -bed FBED Find target genes for regions, such as anchors, SNPs or peaks. - - ``` - - ------ - ------ - ## Extended Analysis Application Scripts - The following analysis application scripts are available when cLoops2 is installed. The majority of them can be independently run. 
The -h option can show example usages and details of parameters. Some of them will be integrated into cLoops sub-programmes if well tested and frequently used. More will be added. - - ### File Format Conversion - - [hicpro2bedpe.py](https://github.com/YaqiangCao/cLoops2/blob/master/scripts/hicpro2bedpe.py) : convert HiC-Pro output allValidPairs file to BEDPE file as input of cLoops2. - - [juicerLong2bedpe.py](https://github.com/YaqiangCao/cLoops2/blob/master/scripts/juicerLong2bedpe.py): convert Juicer output long format interaction file to BEDPE file as input of cLoops2. - - [getBedpeFBed.py](https://github.com/YaqiangCao/cLoops2/blob/master/scripts/getBedpeFBed.py): convert single-end reads in BED format to paired-end reads in BEDPE format with expected fragment size as input of cLoops2 to call peaks. - - --- - ### Analysis without plot - - [getDI.py](https://github.com/YaqiangCao/cLoops2/blob/master/scripts/getDI.py): calculate the [Directionality Index](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3356448/) as , where **x** is the bin and **A** is the interaction reads within the region from specific upstream to bin **x**, and **B** is the downstream reads. - - - [getFRiF.py](https://github.com/YaqiangCao/cLoops2/blob/master/scripts/getFRiF.py): calculate the **F**raction of **R**eads **i**n **F**eatures (FRiF), the features could be domains and peaks annotated with .bed file or domains/stripes/loops with .txt file such as the \_loop.txt file. - - - [getIS.py](https://github.com/YaqiangCao/cLoops2/blob/master/scripts/getIS.py): calculate the [insulation score](https://www.nature.com/articles/nature20158) with a little modification for the data with output of a bedGraph file, the math formula used is , where ***x*** is the genomic location, which can be bins or exact base pair, ***I(x-s,x+s)*** is the interactions/PETs observed in the region from ***x-s*** to ***x+s***, and ***s*** should be set a little large, such as 100kb to observe a good fit for the insulation score and TAD boundaries. - - - [getLocalIDS.py](https://github.com/YaqiangCao/cLoops2/blob/master/scripts/getcLocalIDS.py): calculate the local interaction density score for the data with output a bedGraph file, the math formula used is , where ***x*** is the genomic location for the target bin, ***N*** is the total PETs in the target chromosomal, ***I(x,x_i)*** is the observed PETs linking the region bin ***x*** and the ith nearby bin of the same size. - - - [getPETsAno.py](https://github.com/YaqiangCao/cLoops2/blob/master/scripts/getPETsAno.py): get the PETs ratio of enhancer-promoter, enhancer-enhancer, promoter-promoter, enhancer-none, promoter-none, none-none interactions. - - - [tracPre.py](https://github.com/YaqiangCao/cLoops2/blob/master/scripts/tracPre.py): pre-process the raw reads of FASTQ files of Trac-looping data to the reference genome and obtain the unique PETs with quality control results. - - - [tracPre2.py](https://github.com/YaqiangCao/cLoops2/blob/master/scripts/tracPre2.py): pre-process the raw reads of FASTQ files of Hi-TrAC data to the reference genome and obtain the unique PETs with quality control results. 
- - ----- - ------ - ## Input, Intermediate, Output Files - - [.bedpe](#.bedpe) - - [.ixy](#.ixy) - - [_peaks.txt](#_peaks.txt) - - [_loops.txt](#_loops.txt) - - [_dloops.txt](#_dloops.txt) - - [_domains.txt](#_domains.txt) - - ---- - - ### Input .bedpe file - Mapped PETs in [BEDPE format](http://bedtools.readthedocs.io/en/latest/content/general-usage.html), compressed files with gzip are also accepted, following columns are necessary: chrom1 (1st),start1 (2),end1 (3),chrom2 (4),start2 (5),end2 (6),strand1 (9),strand2 (10). For the column of name or score, "." is accepted. Columns are separated by "\t". - For example as following: - ``` - chr1 9945 10095 chr1 248946216 248946366 . . + + - chr1 10034 10184 chr1 180987 181137 . . + - - chr1 10286 10436 chr1 181103 181253 . . + - - chr1 10286 10436 chr11 181103 181253 . . + - - chr11 10286 10436 chr1 181103 181253 . . + - - ... - ``` - - ------ - - ### Intermediate .ixy file - numpy.array of (x,y) saved to [joblib.dump](https://joblib.readthedocs.io/en/latest/generated/joblib.dump.html) for fast access of the interaction EPTs and contact matrix at any resolution, nearly all cLoops2 related analysis are based on this file type. - ``` - 10099025 10099048 - 39943889 39943890 - 18391007 18391853 - 35502951 35502951 - 10061555 10061557 - ... - ``` - - ------ - - ### Output \_peaks.txt file - column | name | explanation - ------ | ---- | ------------ - 0th | peakId | id for a peak, for example peak\_chr1-chr1-1 - 1th | chrom | chromosomal for the peak - 2th | start | genomic coordinate of the start site - 3th | end | genomic coordinate of the end site - 4th | summit | genomic coordinate of peak summit - 5th | length | length of the peak - 6th | counts | observed reads number in the peak - 7th | RPKM | RPKM for the reads density in the peak - 8th | enrichmentScore | enrichment score for the peak, calculated by observed PETs number divided by the mean PETs number of nearby 10 fold and 20 fold regions - 9th | poissonPvalue | Poisson test p-value for the loop after Bonferroni correction - 10th | controlCounts| if control data such as input/IgG is assigned, the observed reads number in peak region for control data - 11th | controlRPKM | if control data assigned, RPKM for the reads density in the peak region for control data - 12th | controlScaledCount | if control data assigned, the scaled expected counts used for Poisson test/enrichment score against control data - 13th | enrichmentScoreVsControl | if control data assigned, enrichment score of target vs. control - 14th | poissonPvalueVsControl | if control data assigned, Poisson test p-value of target vs. 
control after Bonferroni correction - 15th | significant | 1 or 0, 1 means we think the peak is significant compared to local background and control (if assigned) - - ------ - - ### Output \_loops.txt file - column | name | explanation - ------ | ---- | ------------ - 0th | loopId | id for a loop, for example loop\_chr1-chr1-1 - 1th | chromA | chromosomal for the loop first anchor - 2th | startA | genomic coordinate of the start site for the first anchor - 3th | endA | genomic coordinate of the end site for the first anchor - 4th | chromB | chromosomal for the loop second anchor - 5th | startB | genomic coordinate of the start site for the second anchor - 6th | endB | genomic coordinate of the end site for the second anchor - 7th | distance | distance (bp) between the centers of the anchors for the loop - 8th | centerA | genomic coordinate of the center site for the first anchor - 9th | centerB | genomic coordinate of the center site for the second anchor - 10th | readsA | observed PETs number for the first anchor - 11th | readsB | observed PETs number for the second anchor - 12th | cis | whether the loop is a intra-chromosomal loop (cis) - 13th | PETs | observed PETs number linking the two anchors - 14th | density | similarly to that of RPKM (reads per kilobase per million): - 15th | enrichmentScore | enrichment score for the loop, calculated by observed PETs number divided by the mean PETs number of nearby permutated regions - 16th | P2LL | peak to the lower left, calculated similar to that of Juicer - 17th | FDR | false discovery rate for the loop, calculated as the number of permutated regions that there are more observed PETs than the region - 18th | binomalPvalue | binomal test p-value for the loop, updated caculation, different from cLoops - 19th | hypergeometricPvalue | hypergeometric test p-value for the loop - 20th | poissonPvalue | Poisson test p-value for the loop - 21th | xPeakpoissonPvalue | Poisson test p-value for the left anchor potential peak p-value - 22th | yPeakpoissonPvalue | Poisson test p-value for the right anchor potential peak p-value - 23th | significant | 1 or 0, 1 means we think the loop is significant compared to permutated regions. In cLoops2, only significant loops are written to the file. 
- - ------ - - ### Output \_dloops.txt file - column | name | explanation - ------ | ---- | ------------ - 0th | loopId | id for a loop, for example loop\_chr1-chr1-1 - 1th | chromA | chromosomal for the loop first anchor - 2th | startA | genomic coordinate of the start site for the first anchor - 3th | endA | genomic coordinate of the end site for the first anchor - 4th | chromB | chromosomal for the loop second anchor - 5th | startB | genomic coordinate of the start site for the second anchor - 6th | endB | genomic coordinate of the end site for the second anchor - 7th | distance | distance (bp) between the centers of the anchors for the loop - 8th | centerA | genomic coordinate of the center site for the first anchor - 9th | centerB | genomic coordinate of the center site for the second anchor - 10th | rawTargetAnchorAReads | observed PETs number for the first anchor in target sample - 11th | rawTargetAnchorBReads | observed PETs number for the second anchor in target sample - 12th | rawControlAnchorAReads | observed PETs number for the first anchor in control sample - 13th | rawControlAnchorBReads | observed PETs number for the second anchor in control sample - 14th | scaledTargetAnchorAReads | scaled PETs number for the first anchor in target sample - 15th | scaledTargetAnchorBReads | scaled PETs number for the second anchor in target sample - 16th | rawTargetCounts | raw PETs number for the loop in target sample - 17th | scaledTargetCounts | scaled PETs number for the loop in target sample, fitting to control sample - 18th | rawControlCounts | raw PETs number for the loop in control sample - 19th | rawTargetNearbyMedianCounts | raw median PETs number for the loop nearby permutation regions in target sample - 20th | scaledTargetNearbyMedianCounts | scaled median PETs number for the loop nearby permutation regions in target sample, fitting to control sample - 21th | rawControlNearbyMedianCounts | raw median PETs number for the loop nearby permutation regions in control sample - 22th | rawTargetES | target sample rawTargetCounts/rawTargetNearbyMedianCounts - 23th | rawControlES | control sample rawControlCounts/rawControlNearbyMedianCounts - 24th | targetDensity | raw interaction density in target sample, RPKM - 25th | controlDensity | raw interaction density in control sample, RPKM - 26th | rawFc | raw fold change of the interaction density, log2(target/control) - 27th | scaledFc | scaled fold change of PETs, log2( scaledTargetCounts/rawControlCounts ) - 28th | poissonPvalue | possion p-value for the significance test after Bonferroni correction - 29th | significant | 1 or 0, 1 means we think the loop is significant differentlly enriched - - ------ - - ### Output \_domains.txt file - column | name | explanation - ------ | ---- | ------------ - 0th | domainId | id for a domain, for example domain\_0 - 1th | chrom | chromosomal for the loop first anchor - 2th | start | genomic coordinate of the start site for the domain - 3th | end | genomic coordinate of the end site for the domain - 4th | length | length of the domain - 5th | binSize | bin size used for the matrix to call the domain - 6th | winSize | window size used for the matrix to call the domain - 7th | segregationScore | mean segregation score for all bins within the domain - 8th | totalPETs | number of total PETs in the domain - 9th | withinDomainPETs | number of PETs only interacting within the domain - 10th | enrichmentScore | (withinDomainPETs) / (totalPETs-withinDomainPETs) - 11th | density | similarly to that of RPKM (reads 
per kilobase per million): - - ------ - - ### Output \_loopsGtfAno.txt file - column | name | explanation - ------ | ---- | ------------ - 0th | loopId | loopId from input file - 1th | typeAnchorA | annotated type of anchor a (left anchor), enhancer or promoter - 2th | typeAnchorB | annotated type of anchor b (right anchor) - 3th | nearestDistanceToGeneAnchorA | distance of anchor a to nearest TSS - 4th | nearestDistanceToGeneAnchorB | distance of anchor b to nearest TSS - 5th | nearestTargetGeneAnchorA | anchor a nearest TSS gene, for example chr21:34836286-34884882\|+\|AP000331.1 (named by rules of chrom:start-end\|strand\|geneName). If a promoter overlaps two head-to-head genes, all genes will be reported by seperation of a comma. - 6th | nearestTargetGeneAnchorB | anchor b nearest TSS gene - - ------ - - ### Output \_mergedAnchors.txt file - column | name | explanation - ------ | ---- | ------------ - 0th | anchorId | id for merged anchors. For example, chr21:14025126-14026192\|Promoter (named by the rule of: chrom:start-end\|type) - 1th | chrom | chromosome - 2th | start | start - 3th | end | end - 4th | type | annotated type for the anchor, enhancer or promoter - 5th | nearestDistanceToTSS | distance of anchor a to nearest TSS - 6th | nearestGene | nearest gene name. If a promoter overlaps two head-to-head genes, all genes will be reported by seperation of a comma. - 7th | nearestGeneLoc | neart gene information. For example, chr21:34787801-35049344\|-\|RUNX1 (named by the rule of: chrom:start-end\|strand\|name). If a promoter overlaps two head-to-head genes, all genes will be reported by seperation of a comma. - - ------ - - ### Output \_loop2anchors.txt file - column | name | explanation - ------ | ---- | ------------ - 0th | loopId | loopId from input file - 1th | mergedAnchorA | original anchor a (left anchor) to new merged anchor id - 2th | mergedAnchorB | original anchor b (right anchor) to new merged anchor id - - ------ - - ### Output \_targets.txt file - column | name | explanation - ------ | ---- | ------------ - 0th | promoter | annotated anchors that overlapped or very close to gene's transcription start site. For example, chr21:35043062-35051895\|Promoter (named by the rule of: chrom:start-end\|Promoter). - 1th | PromoterTarget | promoter target genes. If a promoter is shared by multiple genes, all genes will be reported and seperated by comma. For example, chr21:34787801-35049344\|-\|RUNX1 (named by the rule of: chorm:start-end\|strand\|name. - 2th | directEnhancer | enhancers that directly looping with target promoter. Multiple enhancers will be reported and seperated by comma. For example, chr21:35075636-35077527\|Enhancer,chr21:35026356-35028520\|Enhancer,chr21:34801302-34805056\|Enhancer. - 3th | indirectEnhancer | enhancers that indirectly looping with target promoter, by enhancer-enhancer-promoter or enhancer-promoter-promoter. Multiple enhancers will be reported and seperated by comma. - 4th | directPromoter | other promoters directly looping with target promoter. - 5th | indirectPromoter | other promoters indirectly looping with target promoter, by promoter-enhancer-promoter or promoter-promoter-promoter. - 6th | directEnhancerHub | hub of direct enhancer. If there are more than 2 direct enhancers, using HITS algorithm to find the most linked one and report. - 7th | indirectEnhancerHub | hub of indirect enhancer. If there are more than 2 indirect enhancers, using HITS algorithm to find the most linked one and report. 
- - - -------- - -------- - ## cLoops2 citations - - -------- - -------- - ## cLoops2 updates - - - -Keywords: peak-calling loop-calling Hi-Trac interaction visualization -Platform: UNKNOWN -Classifier: Environment :: Console -Classifier: Operating System :: MacOS :: MacOS X -Classifier: Operating System :: POSIX -Classifier: Topic :: Scientific/Engineering :: Bio-Informatics -Requires-Python: >=3 -Description-Content-Type: text/markdown +``` + +------ +### 18. Annotation of loops to genes +Run **cLoops2 anaLoops -h** to see details. +``` +Annotating loops: +- find the closest TSS for each loop anchors +- merge the loop anchors and classify them as enhancers or promoters based on + distance to nearest TSS +- build the interaction networks for merged anchors +- find the all interacted enhancers/promoters for each promoter + +Basic mode 1: with -gtf, loops will be annotated as enhancer or promoter based +on distance to nearest gene. If a anchor overlapped with two/multiple promoters +(often seen for close head-to-head genes), all will be reported. If no TSS +overlaps, then nearest one will be assigned. + +Basic mode 2: with -gtf -net, overlapped anchors will be merged and annoated as +enhancer or promoter considering distance to genes. For each promoter, all +linked enhancer and promoter will be shown. If there are more than 3 direct or +indirect enhancers for a promoter, HITS algorithm will be used to identify one +hub for indirect enhancer and one hub for indirect enhancer. + +Examples: + 1. annotate loops for target gene, basic mode 1 + cLoops2 anaLoops -loops test_loops.txt -gtf genecode.gtf + + 2. annotate loops for target transcripts (alternative TSS), basic mode 1 + cLoops2 anaLoops -loops test_loops.txt -gtf genecode.gtf -tid + + 3. find a gene's all linked enhancer or promoter, basic mode 2 + cLoops2 anaLoops -loops test_loops.txt -gtf genecode.gtf -net + +optional arguments: + -h, --help show this help message and exit + -d PREDIR Assign data directory generated by cLoops2 pre to carry out analysis. + -o FNOUT Output data directory / file name prefix, default is cLoops2_output. + -p CPU CPUs used to run the job, default is 1, set -1 to use all CPUs + available. Too many CPU could cause out-of-memory problem if there are + too many PETs. + -cut CUT Distance cutoff to filter cis PETs, only keep PETs with distance + >=cut. Default is 0, no filtering. + -mcut MCUT Keep the PETs with distance <=mcut. Default is -1, no filtering. + -v Show cLoops2 verison number and exit. + --- Following are sub-commands specific options. This option just show + version of cLoops2. + -loops FLOOP The _loop.txt file generated by cLoops2 callLoops or callDiffLoops. + -gtf GTF GTF file annotation for genes. + -tid Whether to use transcript id instead of gene id for annotation. Default + is not. + -pdis PDIS Distance limitation for anchor to nearest gene/transcript TSS to define + as promoter. Default is 2000 bp. + -net Whether to use network method to find all enhancer/promoter links based + on loops. Default is not. In this mode, overlapped anchors will be + merged and annotated as enhancer/promoter, then for a gene, all linked + node will be output. + -gap GAP When -net is set, the distance for close anchors to merge. Default is 1. + +``` + +------ +### 19. Find target genes of genomic regions with cLoops2 anaLoops output +Run **cLoops2 findTargets -h** to see details. +``` +Find target genes of genomic regions (peaks, SNPs) through enhancer-promoter +networks. 
Output from cLoops2 anaLoops with suffix of _ep_net.sif and +_targets.txt are needed. + +Examples: + 1. find target genes of peaks/SNPs + cLoops2 findTargets -net test_ep_net.sif -tg test_targets.txt \ + -bed GWAS.bed -o test + +optional arguments: + -h, --help show this help message and exit + -d PREDIR Assign data directory generated by cLoops2 pre to carry out analysis. + -o FNOUT Output data directory / file name prefix, default is cLoops2_output. + -p CPU CPUs used to run the job, default is 1, set -1 to use all CPUs + available. Too many CPU could cause out-of-memory problem if there are + too many PETs. + -cut CUT Distance cutoff to filter cis PETs, only keep PETs with distance + >=cut. Default is 0, no filtering. + -mcut MCUT Keep the PETs with distance <=mcut. Default is -1, no filtering. + -v Show cLoops2 verison number and exit. + --- Following are sub-commands specific options. This option just show + version of cLoops2. + -net FNET The _ep_net.sif file generated by cLoops2 anaLoops. + -tg FTG The _targets.txt file generated by cLoops2 anaLoops. + -bed FBED Find target genes for regions, such as anchors, SNPs or peaks. + +``` + +------ +------ +## Extended Analysis Application Scripts +The following analysis application scripts are available when cLoops2 is installed. The majority of them can be independently run. The -h option can show example usages and details of parameters. Some of them will be integrated into cLoops sub-programmes if well tested and frequently used. More will be added. + +### File Format Conversion +- [hicpro2bedpe.py](https://github.com/YaqiangCao/cLoops2/blob/master/scripts/hicpro2bedpe.py) : convert HiC-Pro output allValidPairs file to BEDPE file as input of cLoops2. +- [juicerLong2bedpe.py](https://github.com/YaqiangCao/cLoops2/blob/master/scripts/juicerLong2bedpe.py): convert Juicer output long format interaction file to BEDPE file as input of cLoops2. +- [getBedpeFBed.py](https://github.com/YaqiangCao/cLoops2/blob/master/scripts/getBedpeFBed.py): convert single-end reads in BED format to paired-end reads in BEDPE format with expected fragment size as input of cLoops2 to call peaks. + +--- +### Analysis without plot +- [getDI.py](https://github.com/YaqiangCao/cLoops2/blob/master/scripts/getDI.py): calculate the [Directionality Index](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3356448/) as , where **x** is the bin and **A** is the interaction reads within the region from specific upstream to bin **x**, and **B** is the downstream reads. + +- [getFRiF.py](https://github.com/YaqiangCao/cLoops2/blob/master/scripts/getFRiF.py): calculate the **F**raction of **R**eads **i**n **F**eatures (FRiF), the features could be domains and peaks annotated with .bed file or domains/stripes/loops with .txt file such as the \_loop.txt file. + +- [getIS.py](https://github.com/YaqiangCao/cLoops2/blob/master/scripts/getIS.py): calculate the [insulation score](https://www.nature.com/articles/nature20158) with a little modification for the data with output of a bedGraph file, the math formula used is , where ***x*** is the genomic location, which can be bins or exact base pair, ***I(x-s,x+s)*** is the interactions/PETs observed in the region from ***x-s*** to ***x+s***, and ***s*** should be set a little large, such as 100kb to observe a good fit for the insulation score and TAD boundaries. 
+
+- [getLocalIDS.py](https://github.com/YaqiangCao/cLoops2/blob/master/scripts/getcLocalIDS.py): calculate the local interaction density score for the data, with output of a bedGraph file. The score for a target bin ***x*** is computed from ***I(x,x_i)***, the observed PETs linking bin ***x*** and its ith nearby bin of the same size, and ***N***, the total PETs in the target chromosome.
+
+- [getPETsAno.py](https://github.com/YaqiangCao/cLoops2/blob/master/scripts/getPETsAno.py): get the PETs ratio of enhancer-promoter, enhancer-enhancer, promoter-promoter, enhancer-none, promoter-none, none-none interactions.
+
+- [tracPre.py](https://github.com/YaqiangCao/cLoops2/blob/master/scripts/tracPre.py): pre-process the raw reads of FASTQ files of Trac-looping data to the reference genome and obtain the unique PETs with quality control results.
+
+- [tracPre2.py](https://github.com/YaqiangCao/cLoops2/blob/master/scripts/tracPre2.py): pre-process the raw reads of FASTQ files of Hi-TrAC data to the reference genome and obtain the unique PETs with quality control results.
+
+-----
+------
+## Input, Intermediate, Output Files
+- [.bedpe](#.bedpe)
+- [.ixy](#.ixy)
+- [_peaks.txt](#_peaks.txt)
+- [_loops.txt](#_loops.txt)
+- [_dloops.txt](#_dloops.txt)
+- [_domains.txt](#_domains.txt)
+
+----
+
+### Input .bedpe file
+Mapped PETs in [BEDPE format](http://bedtools.readthedocs.io/en/latest/content/general-usage.html); gzip-compressed files are also accepted. The following columns are necessary: chrom1 (1st), start1 (2), end1 (3), chrom2 (4), start2 (5), end2 (6), strand1 (9), strand2 (10). For the name or score columns, "." is accepted. Columns are separated by "\t".
+For example:
+```
+chr1 9945 10095 chr1 248946216 248946366 . . + +
+chr1 10034 10184 chr1 180987 181137 . . + -
+chr1 10286 10436 chr1 181103 181253 . . + -
+chr1 10286 10436 chr11 181103 181253 . . + -
+chr11 10286 10436 chr1 181103 181253 . . + -
+...
+```
+
+------
+
+### Intermediate .ixy file
+A numpy.array of (x,y) PET coordinates saved with [joblib.dump](https://joblib.readthedocs.io/en/latest/generated/joblib.dump.html) for fast access to the interaction PETs and the contact matrix at any resolution; nearly all cLoops2 related analyses are based on this file type.
+```
+10099025 10099048
+39943889 39943890
+18391007 18391853
+35502951 35502951
+10061555 10061557
+...
+```
+
+------
+
+### Output \_peaks.txt file
+column | name | explanation
+------ | ---- | ------------
+0th | peakId | id for a peak, for example peak\_chr1-chr1-1
+1st | chrom | chromosome of the peak
+2nd | start | genomic coordinate of the start site
+3rd | end | genomic coordinate of the end site
+4th | summit | genomic coordinate of the peak summit
+5th | length | length of the peak
+6th | counts | number of observed reads in the peak
+7th | RPKM | RPKM for the read density in the peak
+8th | enrichmentScore | enrichment score for the peak, calculated as the observed PETs number divided by the mean PETs number of nearby 10-fold and 20-fold regions
+9th | poissonPvalue | Poisson test p-value for the peak after Bonferroni correction
+10th | controlCounts | if control data such as input/IgG is assigned, the number of observed reads in the peak region for the control data
+11th | controlRPKM | if control data assigned, RPKM for the read density in the peak region for the control data
+12th | controlScaledCount | if control data assigned, the scaled expected counts used for the Poisson test/enrichment score against the control data
+13th | enrichmentScoreVsControl | if control data assigned, enrichment score of target vs. control
+14th | poissonPvalueVsControl | if control data assigned, Poisson test p-value of target vs. control after Bonferroni correction
+15th | significant | 1 or 0, 1 means we think the peak is significant compared to the local background and control (if assigned)
+
+------
+
+### Output \_loops.txt file
+column | name | explanation
+------ | ---- | ------------
+0th | loopId | id for a loop, for example loop\_chr1-chr1-1
+1st | chromA | chromosome of the loop's first anchor
+2nd | startA | genomic coordinate of the start site for the first anchor
+3rd | endA | genomic coordinate of the end site for the first anchor
+4th | chromB | chromosome of the loop's second anchor
+5th | startB | genomic coordinate of the start site for the second anchor
+6th | endB | genomic coordinate of the end site for the second anchor
+7th | distance | distance (bp) between the centers of the anchors for the loop
+8th | centerA | genomic coordinate of the center site for the first anchor
+9th | centerB | genomic coordinate of the center site for the second anchor
+10th | readsA | observed PETs number for the first anchor
+11th | readsB | observed PETs number for the second anchor
+12th | cis | whether the loop is an intra-chromosomal loop (cis)
+13th | PETs | observed PETs number linking the two anchors
+14th | density | interaction density of the loop, similar to RPKM (reads per kilobase per million)
+15th | enrichmentScore | enrichment score for the loop, calculated as the observed PETs number divided by the mean PETs number of nearby permutated regions
+16th | P2LL | peak to the lower left, calculated similarly to Juicer
+17th | FDR | false discovery rate for the loop, calculated as the number of permutated regions with more observed PETs than the loop region
+18th | binomalPvalue | binomial test p-value for the loop; updated calculation, different from cLoops
+19th | hypergeometricPvalue | hypergeometric test p-value for the loop
+20th | poissonPvalue | Poisson test p-value for the loop
+21st | xPeakpoissonPvalue | Poisson test p-value for the left anchor as a potential peak
+22nd | yPeakpoissonPvalue | Poisson test p-value for the right anchor as a potential peak
+23rd | significant | 1 or 0, 1 means we think the loop is significant compared to permutated regions. In cLoops2, only significant loops are written to the file.
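+
+If the \_loops.txt output is, as assumed here, tab-separated text with a header row matching the columns above, it can be filtered directly with pandas. The snippet below is a minimal sketch (not part of cLoops2); the file names and cutoffs are placeholders only.
+```
+import pandas as pd
+
+# assumes a tab-separated _loops.txt with a header row matching the table above
+loops = pd.read_csv("test_loops.txt", sep="\t")
+
+# keep loops supported by at least 5 PETs and spanning at least 10 kb
+strong = loops[(loops["PETs"] >= 5) & (loops["distance"] >= 10000)]
+
+# export the anchors as a simple BEDPE-like file for use with other tools
+cols = ["chromA", "startA", "endA", "chromB", "startB", "endB", "loopId", "PETs"]
+strong[cols].to_csv("test_loops_filtered.bedpe", sep="\t", header=False, index=False)
+```
+The same approach should work for the \_dloops.txt and \_domains.txt files described below, if they follow the same tab-separated layout.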

------

### Output \_dloops.txt file
column | name | explanation
------ | ---- | ------------
0th | loopId | id for a loop, for example loop\_chr1-chr1-1
1st | chromA | chromosome of the first anchor of the loop
2nd | startA | genomic coordinate of the start site of the first anchor
3rd | endA | genomic coordinate of the end site of the first anchor
4th | chromB | chromosome of the second anchor of the loop
5th | startB | genomic coordinate of the start site of the second anchor
6th | endB | genomic coordinate of the end site of the second anchor
7th | distance | distance (bp) between the centers of the two anchors
8th | centerA | genomic coordinate of the center of the first anchor
9th | centerB | genomic coordinate of the center of the second anchor
10th | rawTargetAnchorAReads | number of observed PETs in the first anchor in the target sample
11th | rawTargetAnchorBReads | number of observed PETs in the second anchor in the target sample
12th | rawControlAnchorAReads | number of observed PETs in the first anchor in the control sample
13th | rawControlAnchorBReads | number of observed PETs in the second anchor in the control sample
14th | scaledTargetAnchorAReads | scaled number of PETs in the first anchor in the target sample
15th | scaledTargetAnchorBReads | scaled number of PETs in the second anchor in the target sample
16th | rawTargetCounts | raw number of PETs for the loop in the target sample
17th | scaledTargetCounts | scaled number of PETs for the loop in the target sample, fitted to the control sample
18th | rawControlCounts | raw number of PETs for the loop in the control sample
19th | rawTargetNearbyMedianCounts | raw median number of PETs in the nearby permuted regions of the loop in the target sample
20th | scaledTargetNearbyMedianCounts | scaled median number of PETs in the nearby permuted regions of the loop in the target sample, fitted to the control sample
21st | rawControlNearbyMedianCounts | raw median number of PETs in the nearby permuted regions of the loop in the control sample
22nd | rawTargetES | enrichment score in the target sample, rawTargetCounts/rawTargetNearbyMedianCounts
23rd | rawControlES | enrichment score in the control sample, rawControlCounts/rawControlNearbyMedianCounts
24th | targetDensity | raw interaction density in the target sample, RPKM
25th | controlDensity | raw interaction density in the control sample, RPKM
26th | rawFc | raw fold change of the interaction density, log2(target/control)
27th | scaledFc | scaled fold change of PETs, log2(scaledTargetCounts/rawControlCounts)
28th | poissonPvalue | Poisson p-value for the significance test after Bonferroni correction
29th | significant | 1 or 0; 1 means the loop is considered significantly differentially enriched

------

### Output \_domains.txt file
column | name | explanation
------ | ---- | ------------
0th | domainId | id for a domain, for example domain\_0
1st | chrom | chromosome of the domain
2nd | start | genomic coordinate of the start site of the domain
3rd | end | genomic coordinate of the end site of the domain
4th | length | length of the domain
5th | binSize | bin size used for the matrix to call the domain
6th | winSize | window size used for the matrix to call the domain
7th | segregationScore | mean segregation score of all bins within the domain
8th | totalPETs | number of total PETs in the domain
9th | withinDomainPETs | number of PETs interacting only within the domain
10th | enrichmentScore | (withinDomainPETs) / (totalPETs - withinDomainPETs)
11th | density | interaction density of the domain, similar to RPKM (reads per kilobase per million)
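
Like the loop file, \_domains.txt is tab-separated text. The sketch below (assuming pandas; the file name `test_domains.txt` is hypothetical) converts the domains into a BED file scored by enrichmentScore, which is convenient for genome-browser inspection:
```
# a minimal sketch for converting _domains.txt to BED; the file name is hypothetical
import pandas as pd

# tab-separated with a header line and domainId as the first column
domains = pd.read_csv("test_domains.txt", sep="\t", index_col=0)

# build a BED-like table: chrom, start, end, name, score
bed = domains[["chrom", "start", "end"]].copy()
bed["name"] = domains.index
bed["score"] = domains["enrichmentScore"].round(3)

# sort by coordinates and write without header/index, as expected for BED
bed = bed.sort_values(["chrom", "start"])
bed.to_csv("test_domains.bed", sep="\t", header=False, index=False)
```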

------

### Output \_loopsGtfAno.txt file
column | name | explanation
------ | ---- | ------------
0th | loopId | loopId from the input file
1st | typeAnchorA | annotated type of anchor a (left anchor), enhancer or promoter
2nd | typeAnchorB | annotated type of anchor b (right anchor)
3rd | nearestDistanceToGeneAnchorA | distance of anchor a to the nearest TSS
4th | nearestDistanceToGeneAnchorB | distance of anchor b to the nearest TSS
5th | nearestTargetGeneAnchorA | gene with the TSS nearest to anchor a, for example chr21:34836286-34884882\|+\|AP000331.1 (named by the rule of chrom:start-end\|strand\|geneName). If a promoter overlaps two head-to-head genes, all genes are reported, separated by commas.
6th | nearestTargetGeneAnchorB | gene with the TSS nearest to anchor b

------

### Output \_mergedAnchors.txt file
column | name | explanation
------ | ---- | ------------
0th | anchorId | id for the merged anchor, for example chr21:14025126-14026192\|Promoter (named by the rule of chrom:start-end\|type)
1st | chrom | chromosome
2nd | start | start
3rd | end | end
4th | type | annotated type of the anchor, enhancer or promoter
5th | nearestDistanceToTSS | distance of the anchor to the nearest TSS
6th | nearestGene | nearest gene name. If a promoter overlaps two head-to-head genes, all genes are reported, separated by commas.
7th | nearestGeneLoc | nearest gene information, for example chr21:34787801-35049344\|-\|RUNX1 (named by the rule of chrom:start-end\|strand\|name). If a promoter overlaps two head-to-head genes, all genes are reported, separated by commas.

------

### Output \_loop2anchors.txt file
column | name | explanation
------ | ---- | ------------
0th | loopId | loopId from the input file
1st | mergedAnchorA | new merged anchor id for the original anchor a (left anchor)
2nd | mergedAnchorB | new merged anchor id for the original anchor b (right anchor)

------

### Output \_targets.txt file
column | name | explanation
------ | ---- | ------------
0th | promoter | annotated anchor that overlaps or is very close to a gene's transcription start site, for example chr21:35043062-35051895\|Promoter (named by the rule of chrom:start-end\|Promoter)
1st | PromoterTarget | target genes of the promoter. If a promoter is shared by multiple genes, all genes are reported, separated by commas, for example chr21:34787801-35049344\|-\|RUNX1 (named by the rule of chrom:start-end\|strand\|name)
2nd | directEnhancer | enhancers directly looping with the target promoter. Multiple enhancers are reported, separated by commas, for example chr21:35075636-35077527\|Enhancer,chr21:35026356-35028520\|Enhancer,chr21:34801302-34805056\|Enhancer
3rd | indirectEnhancer | enhancers indirectly looping with the target promoter, through enhancer-enhancer-promoter or enhancer-promoter-promoter. Multiple enhancers are reported, separated by commas.
4th | directPromoter | other promoters directly looping with the target promoter
5th | indirectPromoter | other promoters indirectly looping with the target promoter, through promoter-enhancer-promoter or promoter-promoter-promoter
6th | directEnhancerHub | hub of the direct enhancers. If there are more than 2 direct enhancers, the HITS algorithm is used to find and report the most linked one.
7th | indirectEnhancerHub | hub of the indirect enhancers. If there are more than 2 indirect enhancers, the HITS algorithm is used to find and report the most linked one.
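
The \_targets.txt table can be summarized per promoter in the same way. Below is a minimal sketch (assuming pandas; the file name `test_targets.txt` is hypothetical, and empty cells are taken to mean no linked element) that counts direct and indirect enhancers for each promoter:
```
# a minimal sketch for summarizing _targets.txt; the file name is hypothetical
import pandas as pd

# tab-separated with a header line and the promoter anchor as the first column
targets = pd.read_csv("test_targets.txt", sep="\t", index_col=0)

def countItems(s):
    # columns hold comma-separated element lists; empty/missing cells mean none
    if pd.isna(s) or str(s).strip() == "":
        return 0
    return len(str(s).split(","))

targets["nDirectEnhancers"] = targets["directEnhancer"].map(countItems)
targets["nIndirectEnhancers"] = targets["indirectEnhancer"].map(countItems)

# promoters with the most directly looped enhancers
print(targets.sort_values("nDirectEnhancers", ascending=False).head())
```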
+ + +-------- +-------- +## cLoops2 citations + +-------- +-------- +## cLoops2 updates + + diff --git a/cLoops2.egg-info/entry_points.txt b/cLoops2.egg-info/entry_points.txt index fe7565ed..f8d870b6 100644 --- a/cLoops2.egg-info/entry_points.txt +++ b/cLoops2.egg-info/entry_points.txt @@ -1,3 +1,2 @@ [console_scripts] cLoops2 = cLoops2.cLoops2:main - diff --git a/cLoops2/__pycache__/__init__.cpython-310.pyc b/cLoops2/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 00000000..8f28294b Binary files /dev/null and b/cLoops2/__pycache__/__init__.cpython-310.pyc differ diff --git a/cLoops2/__pycache__/agg.cpython-310.pyc b/cLoops2/__pycache__/agg.cpython-310.pyc new file mode 100644 index 00000000..bafacd6c Binary files /dev/null and b/cLoops2/__pycache__/agg.cpython-310.pyc differ diff --git a/cLoops2/__pycache__/ano.cpython-310.pyc b/cLoops2/__pycache__/ano.cpython-310.pyc new file mode 100644 index 00000000..dd923392 Binary files /dev/null and b/cLoops2/__pycache__/ano.cpython-310.pyc differ diff --git a/cLoops2/__pycache__/blockDBSCAN.cpython-310.pyc b/cLoops2/__pycache__/blockDBSCAN.cpython-310.pyc new file mode 100644 index 00000000..89d89065 Binary files /dev/null and b/cLoops2/__pycache__/blockDBSCAN.cpython-310.pyc differ diff --git a/cLoops2/__pycache__/cLoops2.cpython-310.pyc b/cLoops2/__pycache__/cLoops2.cpython-310.pyc new file mode 100644 index 00000000..aa8c4e7f Binary files /dev/null and b/cLoops2/__pycache__/cLoops2.cpython-310.pyc differ diff --git a/cLoops2/__pycache__/callCisLoops.cpython-310.pyc b/cLoops2/__pycache__/callCisLoops.cpython-310.pyc new file mode 100644 index 00000000..3221ab6f Binary files /dev/null and b/cLoops2/__pycache__/callCisLoops.cpython-310.pyc differ diff --git a/cLoops2/__pycache__/callDiffLoops.cpython-310.pyc b/cLoops2/__pycache__/callDiffLoops.cpython-310.pyc new file mode 100644 index 00000000..21cf0327 Binary files /dev/null and b/cLoops2/__pycache__/callDiffLoops.cpython-310.pyc differ diff --git a/cLoops2/__pycache__/callDomains.cpython-310.pyc b/cLoops2/__pycache__/callDomains.cpython-310.pyc new file mode 100644 index 00000000..cd692fb9 Binary files /dev/null and b/cLoops2/__pycache__/callDomains.cpython-310.pyc differ diff --git a/cLoops2/__pycache__/callPeaks.cpython-310.pyc b/cLoops2/__pycache__/callPeaks.cpython-310.pyc new file mode 100644 index 00000000..e51b24a3 Binary files /dev/null and b/cLoops2/__pycache__/callPeaks.cpython-310.pyc differ diff --git a/cLoops2/__pycache__/callTransLoops.cpython-310.pyc b/cLoops2/__pycache__/callTransLoops.cpython-310.pyc new file mode 100644 index 00000000..0c320930 Binary files /dev/null and b/cLoops2/__pycache__/callTransLoops.cpython-310.pyc differ diff --git a/cLoops2/__pycache__/cmat.cpython-310.pyc b/cLoops2/__pycache__/cmat.cpython-310.pyc new file mode 100644 index 00000000..1f3e8855 Binary files /dev/null and b/cLoops2/__pycache__/cmat.cpython-310.pyc differ diff --git a/cLoops2/__pycache__/ds.cpython-310.pyc b/cLoops2/__pycache__/ds.cpython-310.pyc new file mode 100644 index 00000000..9588f636 Binary files /dev/null and b/cLoops2/__pycache__/ds.cpython-310.pyc differ diff --git a/cLoops2/__pycache__/dump.cpython-310.pyc b/cLoops2/__pycache__/dump.cpython-310.pyc new file mode 100644 index 00000000..ed2b3b0b Binary files /dev/null and b/cLoops2/__pycache__/dump.cpython-310.pyc differ diff --git a/cLoops2/__pycache__/est.cpython-310.pyc b/cLoops2/__pycache__/est.cpython-310.pyc new file mode 100644 index 00000000..e171fa92 Binary files /dev/null and 
b/cLoops2/__pycache__/est.cpython-310.pyc differ diff --git a/cLoops2/__pycache__/estDis.cpython-310.pyc b/cLoops2/__pycache__/estDis.cpython-310.pyc new file mode 100644 index 00000000..82f79f6d Binary files /dev/null and b/cLoops2/__pycache__/estDis.cpython-310.pyc differ diff --git a/cLoops2/__pycache__/estSim.cpython-310.pyc b/cLoops2/__pycache__/estSim.cpython-310.pyc new file mode 100644 index 00000000..4545b2a3 Binary files /dev/null and b/cLoops2/__pycache__/estSim.cpython-310.pyc differ diff --git a/cLoops2/__pycache__/filter.cpython-310.pyc b/cLoops2/__pycache__/filter.cpython-310.pyc new file mode 100644 index 00000000..9b42a0ae Binary files /dev/null and b/cLoops2/__pycache__/filter.cpython-310.pyc differ diff --git a/cLoops2/__pycache__/findTargets.cpython-310.pyc b/cLoops2/__pycache__/findTargets.cpython-310.pyc new file mode 100644 index 00000000..a8397761 Binary files /dev/null and b/cLoops2/__pycache__/findTargets.cpython-310.pyc differ diff --git a/cLoops2/__pycache__/geo.cpython-310.pyc b/cLoops2/__pycache__/geo.cpython-310.pyc new file mode 100644 index 00000000..2b1630b8 Binary files /dev/null and b/cLoops2/__pycache__/geo.cpython-310.pyc differ diff --git a/cLoops2/__pycache__/io.cpython-310.pyc b/cLoops2/__pycache__/io.cpython-310.pyc new file mode 100644 index 00000000..60a2eccd Binary files /dev/null and b/cLoops2/__pycache__/io.cpython-310.pyc differ diff --git a/cLoops2/__pycache__/montage.cpython-310.pyc b/cLoops2/__pycache__/montage.cpython-310.pyc new file mode 100644 index 00000000..8401e014 Binary files /dev/null and b/cLoops2/__pycache__/montage.cpython-310.pyc differ diff --git a/cLoops2/__pycache__/plot.cpython-310.pyc b/cLoops2/__pycache__/plot.cpython-310.pyc new file mode 100644 index 00000000..4e090cb2 Binary files /dev/null and b/cLoops2/__pycache__/plot.cpython-310.pyc differ diff --git a/cLoops2/__pycache__/qc.cpython-310.pyc b/cLoops2/__pycache__/qc.cpython-310.pyc new file mode 100644 index 00000000..7c6c01b1 Binary files /dev/null and b/cLoops2/__pycache__/qc.cpython-310.pyc differ diff --git a/cLoops2/__pycache__/quant.cpython-310.pyc b/cLoops2/__pycache__/quant.cpython-310.pyc new file mode 100644 index 00000000..6ff9ee0e Binary files /dev/null and b/cLoops2/__pycache__/quant.cpython-310.pyc differ diff --git a/cLoops2/__pycache__/settings.cpython-310.pyc b/cLoops2/__pycache__/settings.cpython-310.pyc new file mode 100644 index 00000000..893464c9 Binary files /dev/null and b/cLoops2/__pycache__/settings.cpython-310.pyc differ diff --git a/cLoops2/__pycache__/utils.cpython-310.pyc b/cLoops2/__pycache__/utils.cpython-310.pyc new file mode 100644 index 00000000..7153a135 Binary files /dev/null and b/cLoops2/__pycache__/utils.cpython-310.pyc differ diff --git a/dist/cLoops2-0.0.5-py3.10.egg b/dist/cLoops2-0.0.5-py3.10.egg new file mode 100644 index 00000000..1dbbe044 Binary files /dev/null and b/dist/cLoops2-0.0.5-py3.10.egg differ diff --git a/scripts/tracPre2.py b/scripts/tracPre2.py index 72740409..6fafe29a 100755 --- a/scripts/tracPre2.py +++ b/scripts/tracPre2.py @@ -357,10 +357,9 @@ def main(): Batch converting from bam to bedpe. """ #prepare everything + op = help() date = time.strftime(' %Y-%m-%d', time.localtime(time.time())) logger = getLogger(fn=op.output + "/" + date.strip() + "_" + os.path.basename(__file__) + ".log") - - op = help() for t in ["bowtie2", "samtools", "bamToBed"]: if not isTool(t): logger.error("%s not exits! Please install through conda." % t)