diff --git a/README.md b/README.md index 124206a..2c30390 100644 --- a/README.md +++ b/README.md @@ -154,8 +154,8 @@ mafft --auto rice6.9.5.liban.rexdb.cls.pep.INT_TPase.faa > rice6.9.5.liban.rexdb ``` Note: the domain names between rexdb and gydb are different: PROT (rexdb) = AP (gydb), RH (rexdb) = RNaseH (gydb). You should use the actual domain name. -### extracting TE sequences from genome for TEsorter ### -Here are examples to extract TE sequences from outputs of wide-used softwares. +### Extracting TE sequences from genome for TEsorter ### +Here are examples to extract TE sequences from outputs of widely used software, when you have only genome sequences. 1. extract all TE sequences from [RepeatMasker](http://www.repeatmasker.org/RMDownload.html) output: ``` diff --git a/bin/LTR_retriever.py b/bin/LTR_retriever.py index 4503c9c..663c7f5 100644 --- a/bin/LTR_retriever.py +++ b/bin/LTR_retriever.py @@ -82,7 +82,8 @@ def get_full_seqs(self, fout=sys.stdout): d_seqs = seq2dict(self.genome) for rc in self.intact_list(): ltr_seq = d_seqs[rc.chr].seq[rc.start-1:rc.end] - print >> fout, '>{}\n{}'.format(rc.LTR_loc, ltr_seq) + ltr_cls = '{}/{}'.format(rc.TE_type, rc.SuperFamily) + print >> fout, '>{}#{}\n{}'.format(rc.LTR_loc, ltr_cls, ltr_seq) def re_scn(self): # remove redundant idmap = self.seqIdmap lrt_set = set([]) diff --git a/bin/RepeatMasker.py b/bin/RepeatMasker.py index ec7d4af..baae9ec 100644 --- a/bin/RepeatMasker.py +++ b/bin/RepeatMasker.py @@ -24,7 +24,9 @@ def __init__(self, line): convert = [int, float, float, float, str, int, int, str, str, str, str, str, int, str, str ] - assert len(temp) == len(title) or (len(temp)-1 == len(title) and temp[-1] == "*") + assert len(temp) == len(title) or (len(temp)-1 == len(title) and temp[-1] == "*") or (len(temp) == len(title)-1) +# try: assert len(temp) == len(title) or (len(temp)-1 == len(title) and temp[-1] == "*") +# except AssertionError: print >> sys.stderr, temp, '\n', title self.__dict__ = {key: func(value) 
for key,value,func in zip(title, temp, convert)} self.query_left = int(self.query_left.strip('()')) self.repeat_begin = int(self.repeat_begin.strip('()')) @@ -70,6 +72,7 @@ def get_seq(self, seqRecord): id = '{}:{}..{}|{}#{}'.format(self.query_id, self.query_begin, self.query_end, self.repeat_family, self.super_class) teRecord = seqRecord[self.query_begin-1:self.query_end] teRecord.id = id + teRecord.description = id return teRecord class RMOutParser():