forked from ballesterus/Utensils
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpartBreaker.py
executable file
·61 lines (52 loc) · 2.68 KB
/
partBreaker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/usr/bin/env python3
import argparse
import Get_fasta_from_Ref as GFR
import re
from sys import argv
import os
# Presence/absence bookkeeping shared by the functions of this script:
# 'loci' collects the partition names in processing order; every other key
# is a sequence ID mapped to a list of '0'/'1' flags.
presab = {'loci': []}
# Symbols counted as missing data when checking whether a slice is empty.
miss = {'-', '?'}
def Subsetfromto(FastaDict, outFile, start,end):
    """Write the slice [start:end] of every sequence to a multi-FASTA file.

    FastaDict maps sequence IDs to sequences; outFile is created or
    overwritten and receives one ">ID" header line plus one sequence line per
    entry.  start and end are ordinary Python slice bounds.

    As a side effect, one flag per sequence is appended to the module-level
    ``presab`` table: '0' when the slice holds nothing but the symbols in
    ``miss`` (an empty slice also counts), '1' otherwise.
    """
    with open(outFile, 'w') as handle:
        for name, full_seq in FastaDict.items():
            flags = presab.setdefault(name, [])
            fragment = full_seq[start:end]
            # The record is written either way; only the flag differs.
            flags.append('0' if miss.issuperset(fragment) else '1')
            handle.write(">%s\n%s\n" % (name, fragment))
def WritePresAb(paDic, outfile):
    """Dump the presence/absence table *paDic* to the text file *outfile*.

    Every key other than 'loci' produces one line: the key, a tab, then that
    key's values joined by single spaces.  The rows are followed by an empty
    line, the heading "List of loci:" and the space-joined 'loci' entries
    (no newline is written after them).
    """
    with open(outfile, 'w') as handle:
        handle.writelines(
            '%s\t%s\n' % (taxon, ' '.join(flags))
            for taxon, flags in paDic.items()
            if taxon != 'loci'
        )
        handle.write('\nList of loci:\n%s' % ' '.join(paDic['loci']))
def _parse_partition_line(pline):
    """Return (name, start, end) parsed from one "name,first-last" record.

    The positions in the record are 1-based and inclusive; the returned
    start/end are the equivalent 0-based, end-exclusive slice bounds.
    Raises ValueError when the record cannot be parsed or the range is
    impossible (first < 1 or last < first).
    """
    fields = pline.split(',')
    try:
        bounds = fields[1].split('-')
        first = int(bounds[0])
        last = int(bounds[1])
    except (IndexError, ValueError) as err:
        raise ValueError(
            'expected "name,start-end", got %r' % pline.strip()) from err
    if first < 1 or last < first:
        # A first position of 0 would become slice index -1 and silently
        # select the wrong columns, so reject it up front.
        raise ValueError('invalid position range in %r' % pline.strip())
    return fields[0].strip(), first - 1, last


def main(matrix, partfile, outdir):
    """Split the supermatrix into one FASTA file per partition.

    matrix   -- path of the concatenated alignment; it is loaded with
                GFR.Fasta_to_Dict (presumably into an ID -> sequence dict;
                confirm against that module).
    partfile -- path of the partition file, one "name,first-last" record per
                line with 1-based inclusive positions.  Blank lines are
                skipped.
    outdir   -- directory receiving one "<name>.fasta" per partition; it is
                created when missing, otherwise a notice is printed.

    Every partition name is appended to presab['loci'], and Subsetfromto
    appends the per-sequence presence/absence flags to presab.

    Raises ValueError (mentioning the file and line number) for a non-blank
    line that is not a valid "name,first-last" record.
    """
    Smatrix = GFR.Fasta_to_Dict(matrix)
    if os.path.exists(outdir):
        print('The output dir already exist!')
    else:
        os.makedirs(outdir)
    with open(partfile, 'r') as P:
        for lineno, pline in enumerate(P, 1):
            if not pline.strip():
                # Typically the trailing empty line of the file; indexing
                # the missing second column used to raise IndexError here.
                continue
            try:
                outN, start, end = _parse_partition_line(pline)
            except ValueError as err:
                raise ValueError(
                    '%s, line %d: %s' % (partfile, lineno, err)) from err
            presab['loci'].append(outN)
            outf = os.path.join(outdir, '%s.fasta' % outN)
            Subsetfromto(Smatrix, outf, start, end)
if __name__ == "__main__":
    # Command-line entry point: parse the three paths, split the supermatrix,
    # then write the accumulated presence/absence table.
    parser = argparse.ArgumentParser(
        description=(
            'This is a simple script for breaking supermatrices in individual '
            'MSA based on a partition file. The required partition file is a '
            'two column comma separated value text file where the fisrt column '
            'indicates the name of partition, recycled to be used as the name '
            'of the output file, and the second column is an interval of the '
            'positions in the supermatrix, separated only by "-". This script '
            'deals only with consecutive data blocks. Codon partitioning is '
            'not implemented... yet.'
        )
    )
    # All three options are mandatory: without required=True a missing one
    # reaches main() as None and fails there with an unhelpful TypeError.
    parser.add_argument('-in', dest='matrix', type=str, required=True,
                        help='Input alignemnets in fasta format')
    parser.add_argument('-p', dest='partitions', type=str, required=True,
                        help='Input partition definition file: a comma separated text file with two columns, ')
    parser.add_argument('-o', dest='outdir', required=True,
                        help='Specify directory where to write partitions')
    # Disabled placeholder for a future option:
    # parser.add_argument('-c', help="")
    args = parser.parse_args()
    main(args.matrix, args.partitions, args.outdir)
    # The table goes to the current working directory, not to outdir.
    WritePresAb(presab, 'PAmatrix.txt')