-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathreorder_alignment_by_tree.py
executable file
·56 lines (46 loc) · 2.04 KB
/
reorder_alignment_by_tree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#!/usr/bin/env python
#
# reorder_alignment_by_tree.py created 2017-11-03
'''reorder_alignment_by_tree.py last modified 2021-11-10
reorder_alignment_by_tree.py -a matrix.phy -T tree.nex -f phylip-relaxed > reordered.aln
large matrices can be gzipped, as .gz
for partitioned alignments, formats (-f) include:
clustal, fasta, nexus, phylip, phylip-relaxed, stockholm
'''
import sys
import os
import argparse
import time
import gzip
from collections import defaultdict,Counter
from Bio import AlignIO
from Bio import Phylo
def main(argv, wayout):
if not len(argv):
argv.append('-h')
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=__doc__)
parser.add_argument('-a','--alignment', help="supermatrix alignment")
parser.add_argument('-f','--format', default="fasta", help="alignment format [fasta]")
parser.add_argument('-T','--matrix-tree', help="optional Nexus-format tree to reorder matrix")
args = parser.parse_args(argv)
if args.alignment.rsplit('.',1)[1]=="gz": # autodetect gzip format
opentype = gzip.open
sys.stderr.write( "# reading alignment {} as gzipped {}\n".format(args.alignment, time.asctime() ) )
else: # otherwise assume normal open
opentype = open
sys.stderr.write( "# reading alignment {} {}\n".format(args.alignment, time.asctime() ) )
alignedseqs = AlignIO.read(opentype(args.alignment), args.format)
sys.stderr.write( "# Alignment contains {} taxa for {} sites, including gaps\n".format( len(alignedseqs), alignedseqs.get_alignment_length() ) )
seqdict = {}
for seqrec in alignedseqs:
seqdict[seqrec.id] = seqrec
sys.stderr.write( "# reading tree {} {}\n".format(args.matrix_tree, time.asctime() ) )
tree = Phylo.read(args.matrix_tree,"nexus")
cladecount = 0
for clade in tree.get_terminals():
cleanname = str(clade.name).replace("'","").replace('"','')
wayout.write( seqdict[cleanname].format("fasta") )
cladecount += 1
sys.stderr.write( "# wrote {} taxa {}\n".format(cladecount, time.asctime() ) )
if __name__ == "__main__":
main(sys.argv[1:], sys.stdout)