diff --git a/README.md b/README.md index ab8a040..feae491 100644 --- a/README.md +++ b/README.md @@ -192,6 +192,8 @@ tmp/ 4. It may not work for mesopolyploids, and not work for paleopolyploids, of which subgenome-specific TEs have been eliminated. However, the genetic boundary is not very clear. 5. Other unknown cases can be reported to me. +When it do not work, you may try another [pipeline](https://github.com/zhangrengang/subgenome_phasing_example) based on evidence of synteny, orthology and phylogeny. + ### Citation ### If you use `SubPhaser`, please cite: > Jia KH, Wang ZX, Wang L et. al. SubPhaser: A robust allopolyploid subgenome phasing method based on subgenome-specific k-mers [J]. *New Phytologist*, 2022, 235: 801-809 [DOI:10.1111/nph.18173](https://doi.org/10.1111/nph.18173) diff --git a/subphaser/Circos.py b/subphaser/Circos.py index 0366d50..b8b66a8 100644 --- a/subphaser/Circos.py +++ b/subphaser/Circos.py @@ -397,7 +397,8 @@ def centomics_plot(genome, wddir='circos', tr_bed='', tr_labels='', dstfig = '{}.{}'.format(prefix, figfmt) try: os.remove(dstfig) except FileNotFoundError: pass - os.link(figfile, dstfig) + try: os.link(figfile, dstfig) + except PermissionError: shutil.move(figfile, dstfig) # legend lefig = '{}_legend.pdf'.format(prefix) diff --git a/subphaser/Jellyfish.py b/subphaser/Jellyfish.py index bacba29..bb6b38d 100644 --- a/subphaser/Jellyfish.py +++ b/subphaser/Jellyfish.py @@ -244,11 +244,15 @@ def to_gemma(self, prefix, tmpdir, genome=None, mapq=1): return ref= '{}/ref'.format(tmpdir, ) - cmd = '[ ! -f {1}.ok ] && bowtie-build {0} {1} && touch {1}.ok'.format(genome, ref) - run_cmd(cmd, log=True) +# cmd = '[ ! -f {1}.ok ] && bowtie-build {0} {1} && touch {1}.ok'.format(genome, ref) + cmd = '[ ! -f {1}.ok ] && bwa index {0} -p {1} && touch {1}.ok'.format(genome, ref) + run_cmd(cmd, log=True, fail_exit=False) outsam = prefix + '.sam' - cmd = '''bowtie {ref} {reads} -f -p 10 -S --best -M 1 --sam-nohead 2> {reads}.map.log \ - | awk '$5>={mapq}{{if ($3=="*"){{$3="chr0";$4=NR*1}}; print $1"\t"$4"\t"$3}}' > {snp}'''.format( + cmd = '''bwa mem -t 10 -k 15 {ref} {reads} | awk '$1 !~ "^@"' ''' +# cmd = '''bowtie {ref} {reads} -f -p 10 -S --best -M 1 --sam-nohead 2> {reads}.map.log ''' + + cmd += ''' | awk '$5>={mapq}{{if ($3=="*"){{$3="chr0";$4=NR*1}}; print $1"\t"$4"\t"$3}}' > {snp}''' + cmd = cmd.format( ref=ref, reads=outfa, snp=outsnp, mapq=mapq) run_cmd(cmd, log=True) diff --git a/subphaser/api/TEsorter/app.py b/subphaser/api/TEsorter/app.py index 63beb56..a2e90e8 100755 --- a/subphaser/api/TEsorter/app.py +++ b/subphaser/api/TEsorter/app.py @@ -92,6 +92,13 @@ def Args(): parser.add_argument("-prob", "--min-probability", action="store", default=0.5, type=float, help="mininum posterior probability for protein domains in HMMScan output [default=%(default)s]") + parser.add_argument("-score", "--min-score", action="store", + default=0.1, type=float, + help="mininum score for protein domains in HMMScan output [default=%(default)s]") + + parser.add_argument("-mask", action="store", type=str, nargs='+', + default=None, choices=['soft', 'hard'], + help="output masked sequences (soft: masking with lowercase; hard: masking with N) [default=%(default)s]") parser.add_argument("-nocln", "--no-cleanup", action="store_true", default=False, help="do not clean up the temporary directory [default=%(default)s]") @@ -211,27 +218,33 @@ def pipeline(args): logger.info( 'check database '+ db_file) check_db(db_file) + gap = 'X' if args.seq_type == 'prot' else 'N' + kargs = dict(hmmdb = db_file, + db_name = db_name, + prefix = args.prefix, + force_write_hmmscan = args.force_write_hmmscan, + processors = args.processors, + tmpdir = args.tmp_dir, + mincov = args.min_coverage, + maxeval = args.max_evalue, + minprob = args.min_probability, + minscore = args.min_score + ) + if args.genome: logger.info( 'Start identifying pipeline (GENOME mode)' ) #print(open(args.sequence)) #print([(rc.id, len(rc.seq)) for rc in SeqIO.parse(open(args.sequence), 'fasta')]) seq_type = 'nucl' - genomeAnn(genome=args.sequence, + gff, geneSeq = genomeAnn(genome=args.sequence, window_size=args.win_size, window_ovl=args.win_ovl, - hmmdb = db_file, - db_name = db_name, seqtype = seq_type, - prefix = args.prefix, - force_write_hmmscan = args.force_write_hmmscan, - processors = args.processors, - tmpdir = args.tmp_dir, - mincov = args.min_coverage, - maxeval = args.max_evalue, - minprob = args.min_probability, + **kargs ) + mask_gff3(args.sequence, gff, args.prefix, types=args.mask, gap=gap) cleanup(args) logger.info( 'Pipeline done.' ) - return + return # genome mode stop at here logger.info( 'Start classifying pipeline (ELEMENT mode)' ) lens = [len(rc.seq) for rc in SeqIO.parse(open(args.sequence), 'fasta')] if max(lens) > 1e6: @@ -243,18 +256,11 @@ def pipeline(args): seq_type = 'prot' if db_name == 'sine' else args.seq_type gff, geneSeq = LTRlibAnn( ltrlib = args.sequence, - hmmdb = db_file, - db_name = db_name, seqtype = seq_type, - prefix = args.prefix, - force_write_hmmscan = args.force_write_hmmscan, - processors = args.processors, - tmpdir = args.tmp_dir, - mincov = args.min_coverage, - maxeval = args.max_evalue, - minprob = args.min_probability, + **kargs ) + mask_gff3(args.sequence, gff, args.prefix, types=args.mask, gap=gap) # classify classify_out = args.prefix + '.cls.tsv' fc = open(classify_out, 'w') @@ -332,7 +338,17 @@ def pipeline(args): summary(d_class) cleanup(args) logger.info( 'Pipeline done.' ) - +def mask_gff3(inSeq, inRM, outPrefix, types=['hard'], **kargs): + if not types: + return + from .modules.RepeatMasker import mask + for type in types: + outSeqfile = '{}.{}masked'.format(outPrefix, type) + soft = True if type == 'soft' else False + logger.info( '{}-masking `{}`; output `{}`'.format(type, inSeq, outSeqfile) ) + with open(outSeqfile, 'w') as outSeq: + masked, total = mask(inSeq, inRM, outSeq, soft=soft, **kargs) + logger.info('{} / {} ({:.2%}) masked'.format(masked, total, masked/total)) def cleanup(args): # clean up if not args.no_cleanup: @@ -920,7 +936,7 @@ def format_gff_id(id): return re.compile(r'[;=\|]').sub("_", id) def hmm2best(inSeqs, inHmmouts, nucl_len=None, prefix=None, db='rexdb', seqtype='nucl', - mincov=20, maxeval=1e-3, minprob=0.6, genome=False): + mincov=20, maxeval=1e-3, minprob=0.6, minscore=0.5, genome=False): if prefix is None: prefix = inSeqs[0] if nucl_len is None and seqtype=='nucl': @@ -934,7 +950,7 @@ def hmm2best(inSeqs, inHmmouts, nucl_len=None, prefix=None, db='rexdb', seqtype= qid, s, e, domain = key else: qid, domain = key - if rc.hmmcov < mincov or rc.evalue > maxeval or rc.acc < minprob: + if rc.hmmcov < mincov or rc.evalue > maxeval or rc.acc < minprob or rc.score < minscore: continue rawid = qid gene,clade = parse_hmmname(rc.tname, db=db) @@ -1129,11 +1145,12 @@ def genomeAnn(genome, tmpdir='./tmp', seqfmt='fasta',window_size=1e6, window_ovl gff, geneSeq = LTRlibAnn(ltrlib=cutSeq, genome=True, tmpdir=tmpdir, **kargs) logger.info('Summary of classifications:') summary_genome(gff, fout=sys.stdout) + return gff, geneSeq def LTRlibAnn(ltrlib, hmmdb='rexdb', db_name='rexdb', seqtype='nucl', prefix=None, force_write_hmmscan=False, genome=False, processors=4, tmpdir='./tmp', - mincov=20, maxeval=1e-3, minprob=0.5): + mincov=20, maxeval=1e-3, minprob=0.5, minscore=0.5): if prefix is None: prefix = '{}.{}'.format(ltrlib, hmmdb) bin = 'hmmscan' @@ -1149,7 +1166,7 @@ def LTRlibAnn(ltrlib, hmmdb='rexdb', db_name='rexdb', seqtype='nucl', prefix=No processors=processors, bin=bin, seqtype=seqtype, force_write_hmmscan=force_write_hmmscan) logger.info( 'generating gene anntations' ) gff, geneSeq = hmm2best(chunk_files, [domtbl], db=db_name, nucl_len=d_nucl_len, genome=genome, - prefix=prefix, seqtype=seqtype, mincov=mincov, maxeval=maxeval, minprob=minprob) + prefix=prefix, seqtype=seqtype, mincov=mincov, maxeval=maxeval, minprob=minprob, minscore=minscore) return gff, geneSeq def replaceCls(ltrlib, seqtype='nucl', db='rexdb'): diff --git a/subphaser/api/TEsorter/version.py b/subphaser/api/TEsorter/version.py index adf1ed5..1be3a88 100644 --- a/subphaser/api/TEsorter/version.py +++ b/subphaser/api/TEsorter/version.py @@ -1 +1 @@ -__version__ = '1.4.6' +__version__ = '1.4.7' diff --git a/subphaser/colors.py b/subphaser/colors.py index de9cc06..0ebbfce 100644 --- a/subphaser/colors.py +++ b/subphaser/colors.py @@ -4,7 +4,7 @@ from matplotlib import cm import matplotlib -COLORS_HEX = ['#f9c00c', '#00b9f1', '#7200da', '#f9320c', +COLORS_HEX = ['#f9c00c', '#00b9f1', '#7200da', '#f9320c', '#00b8a9', "#F4A460", '#009999', '#00C02E', '#980000','#00ffff','#0000ff','#ff0000','#4a86e8','#ff9900','#ffff00', '#00ff00','#9900ff','#ff00ff','#20124d','#274e13','#000000','#cccccc', '#7f6000','#a64d79','#6aa84f','#fff2cc','#47a952','#3ea6b6','#a5b805','#8f9276','#ca8d7c'] @@ -26,6 +26,8 @@ def colors_r(self): return 'c({})'.format(','.join(map(repr, self.colors_hex))) def __str__(self): return str(self.colors_hex) + def __repr__(self): + return str(self.colors_hex) class Colors: def __init__(self, n):