Skip to content

Commit

Permalink
update readme
Browse files Browse the repository at this point in the history
  • Loading branch information
zhangrengang committed Apr 24, 2024
1 parent 6f6139b commit 28aa05b
Show file tree
Hide file tree
Showing 6 changed files with 58 additions and 32 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,8 @@ tmp/
4. It may not work for mesopolyploids, and not work for paleopolyploids, of which subgenome-specific TEs have been eliminated. However, the genetic boundary is not very clear.
5. Other unknown cases can be reported to me.

When it do not work, you may try another [pipeline](https://github.com/zhangrengang/subgenome_phasing_example) based on evidence of synteny, orthology and phylogeny.

### Citation ###
If you use `SubPhaser`, please cite:
> Jia KH, Wang ZX, Wang L et. al. SubPhaser: A robust allopolyploid subgenome phasing method based on subgenome-specific k-mers [J]. *New Phytologist*, 2022, 235: 801-809 [DOI:10.1111/nph.18173](https://doi.org/10.1111/nph.18173)
Expand Down
3 changes: 2 additions & 1 deletion subphaser/Circos.py
Original file line number Diff line number Diff line change
Expand Up @@ -397,7 +397,8 @@ def centomics_plot(genome, wddir='circos', tr_bed='', tr_labels='',
dstfig = '{}.{}'.format(prefix, figfmt)
try: os.remove(dstfig)
except FileNotFoundError: pass
os.link(figfile, dstfig)
try: os.link(figfile, dstfig)
except PermissionError: shutil.move(figfile, dstfig)

# legend
lefig = '{}_legend.pdf'.format(prefix)
Expand Down
12 changes: 8 additions & 4 deletions subphaser/Jellyfish.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,11 +244,15 @@ def to_gemma(self, prefix, tmpdir, genome=None, mapq=1):
return

ref= '{}/ref'.format(tmpdir, )
cmd = '[ ! -f {1}.ok ] && bowtie-build {0} {1} && touch {1}.ok'.format(genome, ref)
run_cmd(cmd, log=True)
# cmd = '[ ! -f {1}.ok ] && bowtie-build {0} {1} && touch {1}.ok'.format(genome, ref)
cmd = '[ ! -f {1}.ok ] && bwa index {0} -p {1} && touch {1}.ok'.format(genome, ref)
run_cmd(cmd, log=True, fail_exit=False)
outsam = prefix + '.sam'
cmd = '''bowtie {ref} {reads} -f -p 10 -S --best -M 1 --sam-nohead 2> {reads}.map.log \
| awk '$5>={mapq}{{if ($3=="*"){{$3="chr0";$4=NR*1}}; print $1"\t"$4"\t"$3}}' > {snp}'''.format(
cmd = '''bwa mem -t 10 -k 15 {ref} {reads} | awk '$1 !~ "^@"' '''
# cmd = '''bowtie {ref} {reads} -f -p 10 -S --best -M 1 --sam-nohead 2> {reads}.map.log '''

cmd += ''' | awk '$5>={mapq}{{if ($3=="*"){{$3="chr0";$4=NR*1}}; print $1"\t"$4"\t"$3}}' > {snp}'''
cmd = cmd.format(
ref=ref, reads=outfa, snp=outsnp, mapq=mapq)
run_cmd(cmd, log=True)

Expand Down
67 changes: 42 additions & 25 deletions subphaser/api/TEsorter/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,13 @@ def Args():
parser.add_argument("-prob", "--min-probability", action="store",
default=0.5, type=float,
help="mininum posterior probability for protein domains in HMMScan output [default=%(default)s]")
parser.add_argument("-score", "--min-score", action="store",
default=0.1, type=float,
help="mininum score for protein domains in HMMScan output [default=%(default)s]")

parser.add_argument("-mask", action="store", type=str, nargs='+',
default=None, choices=['soft', 'hard'],
help="output masked sequences (soft: masking with lowercase; hard: masking with N) [default=%(default)s]")
parser.add_argument("-nocln", "--no-cleanup", action="store_true",
default=False,
help="do not clean up the temporary directory [default=%(default)s]")
Expand Down Expand Up @@ -211,27 +218,33 @@ def pipeline(args):
logger.info( 'check database '+ db_file)
check_db(db_file)

gap = 'X' if args.seq_type == 'prot' else 'N'
kargs = dict(hmmdb = db_file,
db_name = db_name,
prefix = args.prefix,
force_write_hmmscan = args.force_write_hmmscan,
processors = args.processors,
tmpdir = args.tmp_dir,
mincov = args.min_coverage,
maxeval = args.max_evalue,
minprob = args.min_probability,
minscore = args.min_score
)

if args.genome:
logger.info( 'Start identifying pipeline (GENOME mode)' )
#print(open(args.sequence))
#print([(rc.id, len(rc.seq)) for rc in SeqIO.parse(open(args.sequence), 'fasta')])
seq_type = 'nucl'
genomeAnn(genome=args.sequence,
gff, geneSeq = genomeAnn(genome=args.sequence,
window_size=args.win_size, window_ovl=args.win_ovl,
hmmdb = db_file,
db_name = db_name,
seqtype = seq_type,
prefix = args.prefix,
force_write_hmmscan = args.force_write_hmmscan,
processors = args.processors,
tmpdir = args.tmp_dir,
mincov = args.min_coverage,
maxeval = args.max_evalue,
minprob = args.min_probability,
**kargs
)
mask_gff3(args.sequence, gff, args.prefix, types=args.mask, gap=gap)
cleanup(args)
logger.info( 'Pipeline done.' )
return
return # genome mode stop at here
logger.info( 'Start classifying pipeline (ELEMENT mode)' )
lens = [len(rc.seq) for rc in SeqIO.parse(open(args.sequence), 'fasta')]
if max(lens) > 1e6:
Expand All @@ -243,18 +256,11 @@ def pipeline(args):
seq_type = 'prot' if db_name == 'sine' else args.seq_type
gff, geneSeq = LTRlibAnn(
ltrlib = args.sequence,
hmmdb = db_file,
db_name = db_name,
seqtype = seq_type,
prefix = args.prefix,
force_write_hmmscan = args.force_write_hmmscan,
processors = args.processors,
tmpdir = args.tmp_dir,
mincov = args.min_coverage,
maxeval = args.max_evalue,
minprob = args.min_probability,
**kargs
)

mask_gff3(args.sequence, gff, args.prefix, types=args.mask, gap=gap)
# classify
classify_out = args.prefix + '.cls.tsv'
fc = open(classify_out, 'w')
Expand Down Expand Up @@ -332,7 +338,17 @@ def pipeline(args):
summary(d_class)
cleanup(args)
logger.info( 'Pipeline done.' )

def mask_gff3(inSeq, inRM, outPrefix, types=['hard'], **kargs):
if not types:
return
from .modules.RepeatMasker import mask
for type in types:
outSeqfile = '{}.{}masked'.format(outPrefix, type)
soft = True if type == 'soft' else False
logger.info( '{}-masking `{}`; output `{}`'.format(type, inSeq, outSeqfile) )
with open(outSeqfile, 'w') as outSeq:
masked, total = mask(inSeq, inRM, outSeq, soft=soft, **kargs)
logger.info('{} / {} ({:.2%}) masked'.format(masked, total, masked/total))
def cleanup(args):
# clean up
if not args.no_cleanup:
Expand Down Expand Up @@ -920,7 +936,7 @@ def format_gff_id(id):
return re.compile(r'[;=\|]').sub("_", id)

def hmm2best(inSeqs, inHmmouts, nucl_len=None, prefix=None, db='rexdb', seqtype='nucl',
mincov=20, maxeval=1e-3, minprob=0.6, genome=False):
mincov=20, maxeval=1e-3, minprob=0.6, minscore=0.5, genome=False):
if prefix is None:
prefix = inSeqs[0]
if nucl_len is None and seqtype=='nucl':
Expand All @@ -934,7 +950,7 @@ def hmm2best(inSeqs, inHmmouts, nucl_len=None, prefix=None, db='rexdb', seqtype=
qid, s, e, domain = key
else:
qid, domain = key
if rc.hmmcov < mincov or rc.evalue > maxeval or rc.acc < minprob:
if rc.hmmcov < mincov or rc.evalue > maxeval or rc.acc < minprob or rc.score < minscore:
continue
rawid = qid
gene,clade = parse_hmmname(rc.tname, db=db)
Expand Down Expand Up @@ -1129,11 +1145,12 @@ def genomeAnn(genome, tmpdir='./tmp', seqfmt='fasta',window_size=1e6, window_ovl
gff, geneSeq = LTRlibAnn(ltrlib=cutSeq, genome=True, tmpdir=tmpdir, **kargs)
logger.info('Summary of classifications:')
summary_genome(gff, fout=sys.stdout)
return gff, geneSeq

def LTRlibAnn(ltrlib, hmmdb='rexdb', db_name='rexdb', seqtype='nucl', prefix=None,
force_write_hmmscan=False, genome=False,
processors=4, tmpdir='./tmp',
mincov=20, maxeval=1e-3, minprob=0.5):
mincov=20, maxeval=1e-3, minprob=0.5, minscore=0.5):
if prefix is None:
prefix = '{}.{}'.format(ltrlib, hmmdb)
bin = 'hmmscan'
Expand All @@ -1149,7 +1166,7 @@ def LTRlibAnn(ltrlib, hmmdb='rexdb', db_name='rexdb', seqtype='nucl', prefix=No
processors=processors, bin=bin, seqtype=seqtype, force_write_hmmscan=force_write_hmmscan)
logger.info( 'generating gene anntations' )
gff, geneSeq = hmm2best(chunk_files, [domtbl], db=db_name, nucl_len=d_nucl_len, genome=genome,
prefix=prefix, seqtype=seqtype, mincov=mincov, maxeval=maxeval, minprob=minprob)
prefix=prefix, seqtype=seqtype, mincov=mincov, maxeval=maxeval, minprob=minprob, minscore=minscore)
return gff, geneSeq

def replaceCls(ltrlib, seqtype='nucl', db='rexdb'):
Expand Down
2 changes: 1 addition & 1 deletion subphaser/api/TEsorter/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '1.4.6'
__version__ = '1.4.7'
4 changes: 3 additions & 1 deletion subphaser/colors.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from matplotlib import cm
import matplotlib

COLORS_HEX = ['#f9c00c', '#00b9f1', '#7200da', '#f9320c',
COLORS_HEX = ['#f9c00c', '#00b9f1', '#7200da', '#f9320c', '#00b8a9', "#F4A460", '#009999', '#00C02E',
'#980000','#00ffff','#0000ff','#ff0000','#4a86e8','#ff9900','#ffff00',
'#00ff00','#9900ff','#ff00ff','#20124d','#274e13','#000000','#cccccc',
'#7f6000','#a64d79','#6aa84f','#fff2cc','#47a952','#3ea6b6','#a5b805','#8f9276','#ca8d7c']
Expand All @@ -26,6 +26,8 @@ def colors_r(self):
return 'c({})'.format(','.join(map(repr, self.colors_hex)))
def __str__(self):
return str(self.colors_hex)
def __repr__(self):
return str(self.colors_hex)

class Colors:
def __init__(self, n):
Expand Down

0 comments on commit 28aa05b

Please sign in to comment.