-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy pathcd-hit2fasta.py
48 lines (39 loc) · 1.55 KB
/
cd-hit2fasta.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
"""
Parse a cd-hit output file and a fasta file and make a directory of clusters.
NOTE: In this case the sequence id's are ALL numbers (i.e. we have used renumber_fasta.py on them)
"""
import os
import sys
import argparse
from roblib import sequences
import re
__author__ = 'Rob Edwards'
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Parse a cd-hit output file and a fasta file and make a directory of clusters')
parser.add_argument('-f', help='fasta file for DNA sequences', required=True)
parser.add_argument('-c', help='cd-hit output file with the clusters', required=True)
parser.add_argument('-o', help='output directory', required=True)
args = parser.parse_args()
fa = sequences.read_fasta(args.f)
if os.path.exists(args.o):
sys.stderr.write("ERROR: OUTPUT DIRECTORY {} EXISTS\n".format(args.o))
sys.exit(-1)
os.mkdir(args.o)
cluster = None
out = None
with open(args.c, 'r') as fin:
for l in fin:
if l.startswith('>Cluster'):
m=re.match('>Cluster\s+(\d+)', l)
cluster = m.group(1)
if out:
out.close()
out = open(os.path.join(args.o, "{}.fa".format(cluster)), 'w')
else:
m=re.search('>(\d+)\.\.\.', l)
if not m:
sys.stderr.write("Could not find a seq id in: {}".format(l))
continue
seqid = m.group(1)
out.write(">{}\n{}\n".format(seqid, fa[seqid]))
out.close()