Skip to content

Commit

Permalink
add -mask to output hard or soft masked sequences
Browse files Browse the repository at this point in the history
  • Loading branch information
zhangrengang committed Apr 16, 2024
1 parent 8938ca6 commit 4bc18fa
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 20 deletions.
20 changes: 12 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,36 +51,36 @@ TEsorter TEsorter/test/rice6.9.5.liban
```
By default, the newly released [REXdb](http://repeatexplorer.org/?page_id=918) ([viridiplantae_v3.0 + metazoa_v3](https://bitbucket.org/petrnovak/re_databases)) database is used, which is more sensitive and more common and thus is recommended.

For plants ([an example](https://raw.githubusercontent.com/oushujun/EDTA/master/database/rice6.9.5.liban)), it might be better to use only the plant database (Note that the input file is TE or LTR sequences but not genome sequences):
For plants ([an example](https://raw.githubusercontent.com/oushujun/EDTA/master/database/rice6.9.5.liban)), it might be better to use only the plant database (**Note that the input file is TE or LTR sequences but not genome sequences: ELEMENT mode**):
```
TEsorter input_file -db rexdb-plant
TEsorter TE.fasta -db rexdb-plant
```

Classical [GyDB](http://gydb.org/) can also be used:
```
TEsorter input_file -db gydb
TEsorter TE.fasta -db gydb
```
To speed up, use more processors [default=4]:
```
TEsorter input_file -p 20
TEsorter TE.fasta -p 20
```
To improve sensitivity, reduce the criteria (coverage and E-value):
```
TEsorter input_file -p 20 -cov 10 -eval 1e-2
TEsorter TE.fasta -p 20 -cov 10 -eval 1e-2
```
To improve specificity, increase the criteria and disable the pass2 mode:
```
TEsorter input_file -p 20 -cov 30 -eval 1e-5 -dp2
TEsorter TE.fasta -p 20 -cov 30 -eval 1e-5 -dp2
```
To improve sensitivity of pass-2, reduce the [80–80–80 rule](http://doi.org/10.1038/nrg2165-c3) which may be too strict for superfamily-level classification:
```
TEsorter input_file -p 20 -rule 70-30-80
TEsorter TE.fasta -p 20 -rule 70-30-80
```
To classify TE polyprotein sequences ([an example](http://www.repeatmasker.org/RMDownload.html)) or gene protein seqeunces:
```
TEsorter RepeatPeps.lib -st prot -p 20
```
Since version v1.4, a GENOME mode is supported to identify TE protein domains throughout whole genome:
Since version v1.4, a **GENOME mode (input genome sequences)** is supported to identify TE protein domains throughout whole genome:
```
TEsorter genome.fasta -genome -p 20 -prob 0.9
```
Expand Down Expand Up @@ -119,6 +119,7 @@ rice6.9.5.liban.rexdb.cls.tsv TEs/LTR-RTs classifications
Column 7: Domains, e.g. GAG|SIRE PROT|SIRE INT|SIRE RT|SIRE RH|SIRE; `none` for pass-2 classifications
rice6.9.5.liban.rexdb.cls.lib fasta library for RepeatMasker
rice6.9.5.liban.rexdb.cls.pep the same sequences as `rice6.9.5.liban.rexdb.dom.faa`, but id is changed with classifications.
rice6.9.5.liban.rexdb.*masked sequences masking the TE domains
```
Note: the GENOME mode (`-genome`) will not output `*.cls.*` files.

Expand Down Expand Up @@ -158,6 +159,9 @@ options:
maxinum E-value for protein domains in HMMScan output [default=0.001]
-prob MIN_PROBABILITY, --min-probability MIN_PROBABILITY
mininum posterior probability for protein domains in HMMScan output [default=0.5]
-mask {soft,hard} [{soft,hard} ...]
output masked sequences (soft: masking with lowercase;
hard: masking with N) [default=None]
-nocln, --no-cleanup do not clean up the temporary directory [default=False]
-cite, --citation print the citation and exit [default=False]
Expand Down
20 changes: 17 additions & 3 deletions TEsorter/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,9 @@ def Args():
parser.add_argument("-prob", "--min-probability", action="store",
default=0.5, type=float,
help="mininum posterior probability for protein domains in HMMScan output [default=%(default)s]")
parser.add_argument("-mask", action="store", type=str, nargs='+',
default=None, choices=['soft', 'hard'],
help="output masked sequences (soft: masking with lowercase; hard: masking with N) [default=%(default)s]")
parser.add_argument("-nocln", "--no-cleanup", action="store_true",
default=False,
help="do not clean up the temporary directory [default=%(default)s]")
Expand Down Expand Up @@ -216,7 +219,7 @@ def pipeline(args):
#print(open(args.sequence))
#print([(rc.id, len(rc.seq)) for rc in SeqIO.parse(open(args.sequence), 'fasta')])
seq_type = 'nucl'
genomeAnn(genome=args.sequence,
gff, geneSeq = genomeAnn(genome=args.sequence,
window_size=args.win_size, window_ovl=args.win_ovl,
hmmdb = db_file,
db_name = db_name,
Expand All @@ -229,9 +232,10 @@ def pipeline(args):
maxeval = args.max_evalue,
minprob = args.min_probability,
)
mask_gff3(args.sequence, gff, args.prefix, types=args.mask)
cleanup(args)
logger.info( 'Pipeline done.' )
return
return # genome mode stop at here
logger.info( 'Start classifying pipeline (ELEMENT mode)' )
lens = [len(rc.seq) for rc in SeqIO.parse(open(args.sequence), 'fasta')]
if max(lens) > 1e6:
Expand Down Expand Up @@ -332,7 +336,16 @@ def pipeline(args):
summary(d_class)
cleanup(args)
logger.info( 'Pipeline done.' )

def mask_gff3(inSeq, inRM, outPrefix, types=['hard'], **kargs):
if not types:
return
from .modules.RepeatMasker import mask
for type in types:
outSeqfile = '{}.{}masked'.format(outPrefix, type)
soft = True if type == 'soft' else False
logger.info( '{}-masking `{}`; output `{}`'.format(type, inSeq, outSeqfile) )
with open(outSeqfile, 'w') as outSeq:
mask(inSeq, inRM, outSeq, soft=soft, **kargs)
def cleanup(args):
# clean up
if not args.no_cleanup:
Expand Down Expand Up @@ -1129,6 +1142,7 @@ def genomeAnn(genome, tmpdir='./tmp', seqfmt='fasta',window_size=1e6, window_ovl
gff, geneSeq = LTRlibAnn(ltrlib=cutSeq, genome=True, tmpdir=tmpdir, **kargs)
logger.info('Summary of classifications:')
summary_genome(gff, fout=sys.stdout)
return gff, geneSeq

def LTRlibAnn(ltrlib, hmmdb='rexdb', db_name='rexdb', seqtype='nucl', prefix=None,
force_write_hmmscan=False, genome=False,
Expand Down
6 changes: 3 additions & 3 deletions TEsorter/modules/RepeatMasker.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,20 +161,20 @@ def mask(inSeq, inRM, outSeq, exclude=None, suffix=None, simpleSoft=False, exclu
regions2 = d_simple.get(rc.id, [])
for start, end in sorted(regions2):
seq[start:end] = list(map(lower, seq[start:end]))
print('total {}, exclude {}, hard {}, soft {}.'.format(len(set(d_mask[rc.id])), len(set(d_exclude.get(rc.id, []))), len(regions), len(regions2)), file=sys.stderr)
# print('total {}, exclude {}, hard {}, soft {}.'.format(len(set(d_mask[rc.id])), len(set(d_exclude.get(rc.id, []))), len(regions), len(regions2)), file=sys.stderr)
d_counter = Counter(list(seq))
num_N = d_counter['n'] + d_counter['N']
seq = ''.join(seq)
seq = Seq(seq)
if not len(seq) == len(rc.seq):
print('\tERROR', rc.id, len(seq), len(rc.seq), len(regions), file=sys.stderr)
print(rc.id, len(seq), num_N, 100.0*(len(seq)-num_N)/len(seq), file=sys.stderr)
# print(rc.id, len(seq), num_N, 100.0*(len(seq)-num_N)/len(seq), file=sys.stderr)
if len(seq) == num_N:
all_n += 1
assert len(seq) == len(rc.seq)
rc.seq = seq
SeqIO.write(rc, outSeq, 'fasta')
print(all_n, 'all N', file=sys.stderr)
#print(all_n, 'all N', file=sys.stderr)
def update(dt, dq):
for key, value in dq.items():
dt[key] = set(dt.get(key, [])) | set(value)
Expand Down
14 changes: 8 additions & 6 deletions TEsorter/modules/RunCmdsMP.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,12 @@
import drmaa # for grid
GRID = True
from tempfile import NamedTemporaryFile
except (RuntimeError,ImportError,AttributeError) as e:
# if "DRMAA_LIBRARY_PATH" in format(e):
# logger.warning('Grid computing is not available because DRMAA not configured properly: {}'.format(e))
# else:
# logger.warning('Grid computing is not available because DRMAA not installed: {}'.format(e))
# logger.info('No DRMAA, Switching to local/cluster mode.')
except (RuntimeError,ImportError,AttributeError,OSError) as e:
if "DRMAA_LIBRARY_PATH" in format(e):
logger.warning('Grid computing is not available because DRMAA not configured properly: {}'.format(e))
else:
logger.warning('Grid computing is not available because DRMAA not installed: {}'.format(e))
logger.info('No DRMAA (see https://github.com/pygridtools/drmaa-python), Switching to local mode.')
GRID = False

__version__ = '1.1'
Expand Down Expand Up @@ -149,6 +149,7 @@ def which_grid(self):
return 'slurm'
else:
logger.warn('Please provide your grid system `{}` to auther'.format(name))
return 'sge'
def run_tasks(cmd_list, tc_tasks=None, mode='grid', grid_opts='', cpu=1, mem='1g', cont=1,
retry=1, script=None, out_path=None, completed=None, cmd_sep='\n', **kargs):
if not cmd_list:
Expand Down Expand Up @@ -498,6 +499,7 @@ def main():
def run_job(cmd_file=None, cmd_list=None, by_bin=1, tc_tasks=8, mode='grid', grid_opts='-tc {tc}', cont=1, fail_exit=True,
ckpt=None, retry=1, out_path=None, cmd_sep='\n', **kargs):
if not GRID and mode == 'grid':
logger.info('GRID is not available. Turning to the local mode..')
mode = 'local'
tc_tasks = int(tc_tasks)
if cmd_file is None:
Expand Down

0 comments on commit 4bc18fa

Please sign in to comment.