-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathSCUBAT_v2.py
executable file
·636 lines (554 loc) · 24.5 KB
/
SCUBAT_v2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
#!/usr/bin/env python
import sys
import re
import argparse
import numpy
import time
import subprocess
from string import maketrans
from Bio import SearchIO
from Bio import SeqIO
from Bio.Seq import Seq
from Bio._utils import getattr_str, trim_str
from collections import OrderedDict
#####
# Parameters
#####
# Script identity and default thresholds. Identity/coverage/overlap values are
# percentages; insert size and number of Ns are base pairs.
DEFAULT_NAME = 'SCUBAT_v2.py'
VERSION = 2.10
DEFAULT_TRANSCRIPT_IDENTITY_CUTOFF = 95
DEFAULT_HSP_IDENTITY_CUTOFF = 95
DEFAULT_EXON_OVERLAP_CUTOFF = 80
DEFAULT_TRANSCRIPT_OVERLAP_CUTOFF = 50
DEFAULT_LIBRARY_INSERT_SIZE = 100
DEFAULT_TRANSCRIPT_COVERAGE_CUTOFF = 90
DEFAULT_TRANSCRIPT_COVERAGE_ONE_CONTIG = 70
DEFAULT_NUMBER_OF_NS = 100
# Command-line interface.
parser = argparse.ArgumentParser(prog=DEFAULT_NAME,description='Scaffold contigs using Transcripts')
parser.add_argument('-b','--blastfile', dest='blastfile', required=True,
                    help='BLAST output in xml format [Required]')
parser.add_argument('-f','--fastafile', dest='fastafile', required=True,
                    help='Contig assembly in fasta format [Required]')
parser.add_argument('-hid','--hsp_identity_cutoff', type=int,
                    help='BLAST HSP identity cutoff [default: %(default)s]',default=DEFAULT_HSP_IDENTITY_CUTOFF)
parser.add_argument('-tid','--transcript_identity_cutoff', type=int,
                    help='BLAST transcript identity cutoff [default: %(default)s]',default=DEFAULT_TRANSCRIPT_IDENTITY_CUTOFF)
parser.add_argument('-tcov','--transcript_coverage_cutoff', type=int,
                    help='Transcript coverage cutoff [default: %(default)s]',default=DEFAULT_TRANSCRIPT_COVERAGE_CUTOFF)
parser.add_argument('-max','--maximum_intron_size', type=int, required=True,
                    help='Maximum intron size [Required]')
parser.add_argument('-eov','--exon_overlap_cutoff', type=int,
                    help='Exon overlap cutoff [default: %(default)s]',default=DEFAULT_EXON_OVERLAP_CUTOFF)
parser.add_argument('-tov','--transcript_overlap_cutoff', type=int,
                    help='Transcript overlap cutoff [default: %(default)s]',default=DEFAULT_TRANSCRIPT_OVERLAP_CUTOFF)
parser.add_argument('-lis','--library_insert_size', type=int,
                    help='Library insert size [default: %(default)s]',default=DEFAULT_LIBRARY_INSERT_SIZE)
parser.add_argument('-ns','--number_of_ns', type=int,
                    help='Number of Ns to add when merging [default: %(default)s]',default=DEFAULT_NUMBER_OF_NS)
parser.add_argument('-isr','--intron_size_run', dest='intron_size_run', action='store_true',
                    help='Estimate insert size based on the data')
parser.add_argument('--verbose', dest='verbose', action='store_true',
                    help='Print additional verbose files')
parser.add_argument('--version', action='version', version='%(prog)s v' + str(VERSION))
# store_true actions already default to False, so these are redundant but harmless.
parser.set_defaults(verbose=False)
parser.set_defaults(intron_size_run=False)
# With no arguments at all, show the usage text instead of argparse's
# missing-argument error.
if len(sys.argv)==1:
    parser.print_help()
    sys.exit(1)
args = parser.parse_args()
#####
# Classes
#####
####
# Class to contain hsps from different hits in one object
####
class Transcript:
    """Collects the HSPs of one query transcript across multiple hits.

    Mimics enough of Bio.SearchIO's Hit interface (iteration, sort with an
    ``in_place`` keyword) for the overlap-removal helpers to operate on it.
    """
    def __init__(self, query_id='<unknown id>',query_length='<unknown length>'):
        self.query_id = query_id          # BLAST query (transcript) id
        self.query_length = query_length  # transcript length in bp
        self.hsps = []                    # Hspg records, possibly from several contigs
        self._items = []
    def add(self,hsp):
        """Append one Hspg record to the collection."""
        self.hsps.append(hsp)
    def __iter__(self):
        return iter(self.hsps)
    def sort(self, key=None, reverse=False, in_place=True):
        """Sort the stored HSPs; when in_place is False, return a sorted copy.

        Bug fix: the copy branch previously passed the hsp list to the
        constructor as ``query_id`` and called a nonexistent
        ``_transfer_attrs`` helper, raising AttributeError. It now builds a
        proper Transcript with the same id/length and sorted hsps.
        """
        if in_place:
            self.hsps.sort(key=key, reverse=reverse)
        else:
            obj = self.__class__(self.query_id, self.query_length)
            obj.hsps = sorted(self.hsps, key=key, reverse=reverse)
            obj._items = list(self._items)
            return obj
    #print output of the object
    def __str__(self):
        """Render a BLAST-style table of all stored HSPs."""
        lines =[]
        qid_line = 'Query: %s' % self.query_id
        qlength_line = 'Length: %i' % self.query_length
        lines.append(qid_line)
        lines.append(qlength_line)
        if not self.hsps:
            lines.append(' HSPs: ?')
        else:
            lines.append(' HSPs: %s %s %s %s %s %s %s %s %s %s' %
                         ('-'*4, '-'*8, '-'*9, '-'*6, '-'*6, '-'*15, '-'*21, '-'*58, '-'*15, '-'*7))
            pattern = '%11s %8s %9s %6s %6s %15s %21s %58s %15s %7s'
            lines.append(pattern % ('#', 'E-value', 'Bit score', 'ID (%)','Span',
                                    'Query range', 'Hit range', 'Contig', 'Contig Length', 'Strand'))
            lines.append(pattern % ('-'*4, '-'*8, '-'*9, '-'*6, '-'*6, '-'*15, '-'*21, '-'*58, '-'*15, '-'*7))
            for idx, hsp in enumerate(self.hsps):
                # evalue
                evalue = getattr_str(hsp, 'evalue', fmt='%.2g')
                # bitscore
                bitscore = getattr_str(hsp, 'bitscore', fmt='%.2f')
                # identities
                identity = getattr_str(hsp, 'identity', fmt='%4.2f')
                # alignment length
                aln_span = getattr_str(hsp, 'aln_span')
                # query region
                query_start = getattr_str(hsp, 'query_start')
                query_end = getattr_str(hsp, 'query_end')
                query_range = '[%s:%s]' % (query_start, query_end)
                # max column length is 18
                query_range = trim_str(query_range, 15, '~]')
                # hit region
                hit_start = getattr_str(hsp, 'hit_start')
                hit_end = getattr_str(hsp, 'hit_end')
                hit_range = '[%s:%s]' % (hit_start, hit_end)
                hit_range = trim_str(hit_range, 21, '~]')
                # contig id
                contig_id = getattr_str(hsp, 'hit_id')
                # contig length
                contig_length = getattr_str(hsp, 'hit_length')
                # strand
                strand = getattr_str(hsp, 'strand')
                # append the hsp row
                lines.append(pattern % (str(idx), evalue, bitscore, identity, aln_span,
                                        query_range, hit_range, contig_id, contig_length, strand))
        return '\n'.join(lines)
####
# Class that contains the information for each hsp
# Named HSPG to avoid conflict with class HSP of biopython module
####
class Hspg:
    """One high-scoring pair (HSP) together with its hit's contig metadata.

    Holds the subset of Bio.SearchIO HSP fields this script needs; named Hspg
    to avoid clashing with Biopython's own HSP class.
    """
    def __init__(self, evalue, bitscore, identity, aln_span, query_start, query_end, hit_start, hit_end, hit_id, hit_length, strand):
        # Alignment statistics.
        self.evalue = evalue
        self.bitscore = bitscore
        self.identity = identity
        self.aln_span = aln_span
        # Coordinates on the query (transcript).
        self.query_start = query_start
        self.query_end = query_end
        # Coordinates and metadata of the hit (contig).
        self.hit_start = hit_start
        self.hit_end = hit_end
        self.hit_id = hit_id
        self.hit_length = hit_length
        self.strand = strand

    @property
    def query_range(self):
        """Tuple of query start and end coordinates."""
        return (self.query_start, self.query_end)
#####
# functions
#####
####
# remove hsps if they overlap
# choose the one with the highest bitscore
# (primarly to remove hsps in a pseudogene area in the same contig)
####
def removeOverlappingHsps( hit ):
    """Drop the weaker of any adjacent pair of HSPs whose query ranges overlap
    by more than the exon overlap cutoff (keeps the higher bitscore).
    """
    hit.sort(key=lambda hsp: hsp.query_range, reverse=False, in_place=True)
    idx = 0
    while idx < len(hit) - 1:
        first = hit.hsps[idx]
        second = hit.hsps[idx + 1]
        span_a = range(first.query_start, first.query_end)
        span_b = range(second.query_start, second.query_end)
        shared = range(max(span_a[0], span_b[0]) - 1, min(span_a[-1], span_b[-1]) + 1)
        shorter = min(len(span_a), len(span_b))
        overlap_fraction = float(len(shared)) / (shorter + 1)
        if overlap_fraction > round(float(args.exon_overlap_cutoff) / 100, 2):
            # Keep the stronger alignment, then re-examine the same position.
            if first.bitscore <= second.bitscore:
                hit.hsps.pop(idx)
            else:
                hit.hsps.pop(idx + 1)
            idx -= 1
        idx += 1
####
# remove hits if they overlap
# choose the longest one
# (primarly to remove hits in a pseudogene/domain area in other contigs)
####
def removeOverlappingHits(qresult):
    """Drop hits whose transcript span overlaps a longer hit's span by more
    than the transcript overlap cutoff (keeps the hit covering more query).
    """
    qresult.sort(key=lambda hit: len(hit), reverse=True, in_place=True)
    cutoff = round(float(args.transcript_overlap_cutoff) / 100, 2)
    k = 0
    drop_current = False
    while k < len(qresult) - 1:
        cur_span = range(qresult.hits[k].hsps[0].query_start, qresult.hits[k].hsps[-1].query_end)
        i = k + 1
        while i < len(qresult):
            other = qresult.hits[i]
            other_span = range(other.hsps[0].query_start, other.hsps[-1].query_end)
            shared = range(max(cur_span[0], other_span[0]) - 1, min(cur_span[-1], other_span[-1]) + 1)
            shorter = min(len(cur_span), len(other_span))
            if float(len(shared)) / (shorter + 1) > cutoff:
                if len(cur_span) < len(other_span):
                    # The current hit is the shorter one: remove it instead.
                    drop_current = True
                    break
                else:
                    qresult.pop(i)
                    continue
            i += 1
        if drop_current:
            qresult.pop(k)
            k -= 1
            drop_current = False
        k += 1
####
# percentage status calculator
####
def disp_status(prefix,processed,total):
    """Overwrite the current terminal line with '<prefix><percent-done>% '.

    Does nothing when either counter is falsy (e.g. zero processed).
    """
    if not (processed and total):
        return
    percent = round(100 * (float(processed) / float(total)), 2)
    sys.stdout.write("\r%s%4.2f%% " % (prefix, percent))
    sys.stdout.flush()
####
# calculate intron size
####
def propable_intron_size(hit_start_first,hit_end_first,orientation_first,contig_length_first,hit_start_second,hit_end_second,orientation_second,contig_length_second):
    """Return the probable intron size implied by a transcript bridging two contigs.

    The estimate is the sum of the two unaligned contig tails that would face
    each other once the contigs are joined in the given orientations.

    Args:
        hit_start_first/hit_end_first: hit coordinates on the first contig.
        orientation_first: strand of the first hit, '+' or '-'.
        contig_length_first: length of the first contig.
        (same three for the second contig)
    Raises:
        ValueError: if either orientation is not '+' or '-'.
    """
    if orientation_first == "+" and orientation_second == "+":
        # Tail after the hit on the first contig + head before the hit on the second.
        return (contig_length_first - hit_end_first) + hit_start_second
    elif orientation_first == "+" and orientation_second == "-":
        return (contig_length_first - hit_end_first) + (contig_length_second - hit_end_second)
    elif orientation_first == "-" and orientation_second == "-":
        return hit_start_first + (contig_length_second - hit_end_second)
    elif orientation_first == "-" and orientation_second == "+":
        return hit_start_first + hit_start_second
    # Previously an unexpected orientation fell through and raised
    # UnboundLocalError on the return; fail with a clear message instead.
    raise ValueError("unknown orientation pair: %r/%r" % (orientation_first, orientation_second))
####
# create the path
####
def getPath(nodes, start):
    """Follow successor links in *nodes* starting at *start*; return the path.

    Fixes: ``dict.has_key`` is Python 2-only (removed in Python 3); the ``in``
    operator works in both. A visited set also guards against a cycle in the
    successor map, which previously would have looped forever.
    """
    path = [start]
    seen = set(path)
    while path[-1] in nodes:
        successor = nodes[path[-1]]
        if successor in seen:
            break  # cycle detected: stop rather than loop forever
        path.append(successor)
        seen.add(successor)
    return path
####
# return the correct orientation of the contig
####
def getContig(ctgs, node):
    """Return the sequence for *node* ('name/f' or 'name/r').

    The '/f' suffix means forward orientation; anything else yields the
    reverse complement of the stored sequence.
    """
    name = node[:-2]  # strip the '/f' or '/r' suffix
    if node.endswith('f'):
        return ctgs[name]
    return str(Seq(ctgs[name]).reverse_complement())
#####
# Begin
#####
localtime = time.asctime( time.localtime(time.time()) )
print 'Program started: ' + localtime
start = time.time()
# Count <Iteration> tags to get the total number of BLAST queries for the
# progress display (subprocess output is a string such as '123\n').
number_of_hits =subprocess.check_output(['grep','-c', '<Iteration>', args.blastfile])
h = open('intron_size.txt', 'w')
print >>h, '#intron_size \t transcript_id'
#debug = open('debug.txt', 'w')
# Verbose-only diagnostic outputs.
if not args.intron_size_run and args.verbose:
    e = open('filtered_blast_output.txt', 'w')
    v = open('Cyto_table_verbose.sif', 'w')
    p = open('prob_intron.txt', 'w')
    c = open('connection_removed.txt', 'w')
    tr = open('toremove.dict.txt', 'w')
    qcovfile = open('transcript_status.txt', 'w')
    print >>qcovfile, '#transcript_id' + '\t' + 'transcript_coverage' + '\t' + 'identity_percentage' + '\t' + 'status'
    all_connections = open('all_connections.txt', 'w')
    unmapped_transcripts_file = open('unmapped_transcripts.txt', 'w')
# Main scaffolding outputs (skipped in intron-size-only runs).
if not args.intron_size_run:
    scaffold_file = open('SCUBAT_scaffolds.fasta', 'w')
    confiledict = open('connections.dict.txt', 'w')
    s = open('SCUBAT_stats.txt', 'w')
####
#Process BLAST xml
####
# Buckets for classifying each BLAST query (transcript).
no_hit = []
uninformative = []
informative = []
same_contig = []
n = 0
contig_check_inner = {};
contig_check_outer = {};
for qresult in SearchIO.parse(args.blastfile, 'blast-xml'):
    # Collapse redundant alignments before classifying the transcript.
    for hit in qresult:
        removeOverlappingHsps( hit )
    removeOverlappingHits(qresult)
    if len(qresult.hits) == 0:
        # Transcript did not map anywhere.
        no_hit.append(qresult)
        if not args.intron_size_run and args.verbose:
            print >>unmapped_transcripts_file, qresult.id
    elif len(qresult.hits) == 1:
        # NOTE(review): 'hit' here is the leftover loop variable from the loop
        # above; with exactly one hit it is that hit, so this works, but it
        # reads fragile.
        if len(hit.hsps) == 1:
            # One hsp on one contig: nothing to scaffold with.
            uninformative.append(qresult)
        else:
            # Several hsps on a single contig: useful for intron statistics.
            same_contig.append(qresult)
            # Record the extent of the transcript's span on this contig; used
            # later to veto connections that conflict with it.
            if (hit.hsps[0].hit_start < hit.hsps[-1].hit_end):
                contig_check_start = hit.hsps[0].hit_start
                contig_check_end = hit.hsps[-1].hit_end
            else:
                contig_check_start = hit.hsps[-1].hit_start
                contig_check_end = hit.hsps[0].hit_end
            if hit.id in contig_check_inner:
                # NOTE(review): the 'inner' dict is updated with the *end*
                # coordinate and 'outer' with the *start* — confirm this
                # apparent inversion is intentional.
                if contig_check_start < contig_check_inner[hit.id]:
                    contig_check_inner[hit.id] = contig_check_end
                elif contig_check_end > contig_check_outer[hit.id]:
                    contig_check_outer[hit.id] = contig_check_start
                else:
                    pass
            else:
                contig_check_inner[hit.id] = contig_check_end
                contig_check_outer[hit.id] = contig_check_start
    else:
        # Transcript spans several contigs: gather its hsps into one object.
        transcript = Transcript(qresult.id,qresult.seq_len)
        identity_number = 0
        alignment_hsp = 0
        for hit in qresult:
            for hsp in hit:
                identity_hsp = round(100*(float(hsp.ident_num)/hsp.aln_span),2)
                # Keep only hsps above the per-hsp identity cutoff.
                if identity_hsp >= args.hsp_identity_cutoff:
                    identity_number += hsp.ident_num
                    alignment_hsp += hsp.aln_span
                    hsp_t = Hspg(hsp.evalue,hsp.bitscore,identity_hsp,hsp.aln_span,hsp.query_start,hsp.query_end,hsp.hit_start,hsp.hit_end,hit.id,hit.seq_len,hsp.hit_strand)
                    transcript.add(hsp_t)
        sort_key = lambda hsp: hsp.query_range
        transcript.sort(key=sort_key, reverse=False, in_place=True)
        # Transcript coverage: unique query bases covered by at least one hsp.
        k = 0
        qcov = []
        while k < len(transcript.hsps):
            qcov.extend(range(transcript.hsps[k].query_start,transcript.hsps[k].query_end+1))
            k += 1
        qcov = list(set(qcov))
        pers = round(100*(float(len(qcov))/transcript.query_length),2)
        # Change to 1 in case of no hsps passing the identity filter
        if alignment_hsp == 0:
            alignment_hsp = 1
        identity = round(100*(float(identity_number)/alignment_hsp),2)
        # A transcript is "informative" only if it passes both the coverage
        # and overall identity cutoffs.
        if (pers >= args.transcript_coverage_cutoff and identity >= args.transcript_identity_cutoff):
            informative.append(transcript)
            if not args.intron_size_run and args.verbose:
                # NOTE(review): this row omits the 'status' column announced
                # in the transcript_status.txt header.
                print >>qcovfile, transcript.query_id + '\t' + str(pers) + '\t' + str(identity)
        else:
            if not args.intron_size_run and args.verbose:
                print >>qcovfile, transcript.query_id + '\t' + str(pers) + '\t' + str(identity) + '\t' + 'REMOVED'
    n += 1
    disp_status("Processing BLAST xml...",n,number_of_hits)
print ''
if not args.intron_size_run and args.verbose:
    qcovfile.close()
#####
# Calculate stats for intron sizes and exon overlaps
#####
exon_overlaps = []
intron_sizes = []
n = 0
for qresult in same_contig:
    for hit in qresult:
        k = 0
        # Consecutive hsp pairs on the same contig yield observed exon
        # overlaps (on the query) and intron sizes (on the hit).
        while k < len(hit)-1:
            if (hit.hsps[k+1].query_start-hit.hsps[k].query_end)<1:
                overlap_diff = abs(hit.hsps[k+1].query_start-hit.hsps[k].query_end)+1
                exon_overlaps.append(overlap_diff)
            intron_size = abs(hit.hsps[k+1].hit_start-hit.hsps[k].hit_end)+1
            intron_sizes.append(intron_size)
            h.write(str(intron_size)+ '\t' + hit.query_id +'\n')
            k += 1
    n += 1
    disp_status("Calculating stats...",n,len(same_contig))
print ''
#h.close()
# NOTE(review): intron_size.txt ('h') is never closed explicitly; flushing
# relies on interpreter exit.
#####
# Create connections
#####
n = 0
orientation = {}
conn_removed = 0
connections=[]
connections_dict={}
to_remove=[]
for transcript in informative:
k=0
if not args.intron_size_run and args.verbose:
print >>e, transcript
while k < len(transcript.hsps) -1:
flag=1
if (transcript.hsps[k].hit_id == transcript.hsps[k+1].hit_id):
intron_size = abs(transcript.hsps[k+1].hit_start-transcript.hsps[k].hit_end)+1
intron_sizes.append(intron_size)
h.write(str(intron_size)+ '\t' + hit.query_id +'\n')
else:
if (transcript.hsps[k].strand == 1 or transcript.hsps[k].strand == '+'):
transcript.hsps[k].strand='+'
if transcript.hsps[k].hit_id in contig_check_outer and transcript.hsps[k].hit_end < contig_check_outer[transcript.hsps[k].hit_id]:
flag=0
else:
pass
else:
transcript.hsps[k].strand='-'
if transcript.hsps[k].hit_id in contig_check_inner and transcript.hsps[k].hit_start > contig_check_inner[transcript.hsps[k].hit_id]:
flag=0
else:
pass
if (transcript.hsps[k+1].strand == 1 or transcript.hsps[k+1].strand == '+'):
transcript.hsps[k+1].strand='+'
if transcript.hsps[k+1].hit_id in contig_check_inner and transcript.hsps[k+1].hit_start > contig_check_inner[transcript.hsps[k+1].hit_id]:
flag=0
else:
pass
else:
transcript.hsps[k+1].strand='-'
if transcript.hsps[k+1].hit_id in contig_check_outer and transcript.hsps[k+1].hit_end < contig_check_outer[transcript.hsps[k+1].hit_id]:
flag=0
else:
pass
if not args.intron_size_run:
p_intron = propable_intron_size(transcript.hsps[k].hit_start,transcript.hsps[k].hit_end,transcript.hsps[k].strand,transcript.hsps[k].hit_length,transcript.hsps[k+1].hit_start,transcript.hsps[k+1].hit_end,transcript.hsps[k+1].strand,transcript.hsps[k+1].hit_length)
if args.verbose:
p.write(str(p_intron) + '\t' + transcript.query_id + '\n')
if ((p_intron+args.library_insert_size) < args.maximum_intron_size and flag):
if args.verbose:
v.write(transcript.hsps[k].hit_id + '\t' + transcript.query_id + '\t' + transcript.hsps[k+1].hit_id + '\t' + transcript.hsps[k].strand + transcript.hsps[k+1].strand+ '\n')
if (transcript.hsps[k].strand == "+" and transcript.hsps[k+1].strand == "+"):
connections.append(transcript.hsps[k].hit_id + '/f' + '\t' + transcript.hsps[k+1].hit_id + '/f')
connections.append(transcript.hsps[k+1].hit_id + '/r' + '\t' + transcript.hsps[k].hit_id + '/r')
elif (transcript.hsps[k].strand == "+" and transcript.hsps[k+1].strand == "-"):
connections.append(transcript.hsps[k].hit_id + '/f' + '\t' + transcript.hsps[k+1].hit_id + '/r')
connections.append(transcript.hsps[k+1].hit_id + '/f' + '\t' + transcript.hsps[k].hit_id + '/r')
elif (transcript.hsps[k].strand == "-" and transcript.hsps[k+1].strand == "+"):
connections.append(transcript.hsps[k].hit_id + '/r' + '\t' + transcript.hsps[k+1].hit_id + '/f')
connections.append(transcript.hsps[k+1].hit_id + '/r' + '\t' + transcript.hsps[k].hit_id + '/f')
else:
connections.append(transcript.hsps[k].hit_id + '/r' + '\t' + transcript.hsps[k+1].hit_id + '/r')
connections.append(transcript.hsps[k+1].hit_id + '/f' + '\t' + transcript.hsps[k].hit_id + '/f')
else:
if args.verbose:
c.write(transcript.hsps[k].hit_id + '\t' + transcript.query_id + '\t' + transcript.hsps[k+1].hit_id + '\t' + str(p_intron) + '\n')
conn_removed += 1
k += 1
n += 1
disp_status("Creating connections...",n,len(informative))
print ''
if args.intron_size_run:
sys.exit()
if args.verbose:
e.close()
p.close()
v.close()
c.close()
unmapped_transcripts_file.close()
# Count how many multi-contig transcripts still place at least
# DEFAULT_TRANSCRIPT_COVERAGE_ONE_CONTIG percent of their length on a single
# contig (used only for the stats report).
transcripts_passed = 0
transcripts_failed = 0
for transcript in informative:
    pass_flag = 0 ;
    transcript_span = {}
    for hsp in transcript:
        #transcript_span[hsp.hit_id] += hsp.aln_span
        # Total aligned span per contig for this transcript.
        transcript_span[hsp.hit_id] = transcript_span.get(hsp.hit_id, 0) + hsp.aln_span
    for hit_id in transcript_span:
        # NOTE(review): under Python 2 this is integer division, so the ratio
        # floors to 0 unless the span reaches the full query length — confirm
        # whether float division was intended here.
        if ((transcript_span[hit_id]/transcript.query_length)*100) >= DEFAULT_TRANSCRIPT_COVERAGE_ONE_CONTIG:
            pass_flag = 1 ;
    if pass_flag == 1:
        transcripts_passed += 1;
    else:
        transcripts_failed += 1;
# Deduplicate connections (each pair was appended in both orientations, and
# several transcripts may imply the same join).
connections = list(set(connections))
if args.verbose:
    for conn in connections:
        print >>all_connections, conn
    all_connections.close()
# Nodes that appear more than once as a predecessor or successor are
# ambiguous; every connection touching one is dropped so each kept node has
# at most one successor (a simple linear path graph).
precs=[]
sucs=[]
for conn in connections:
    pre,suc=conn.split('\t')
    precs.append(pre)
    sucs.append(suc)
precs_duplicates= list (set([x for x in precs if precs.count(x) > 1]))
sucs_duplicates= list (set([x for x in sucs if sucs.count(x) > 1]))
if args.verbose:
    print >>tr, "#precs"
    for trm in precs_duplicates:
        print >>tr, trm
    print >>tr, "#sucs"
    for trm in sucs_duplicates:
        print >>tr, trm
    print >>tr, "#connections removed"
# Build the successor map; start_nodes will be reduced to nodes that are
# never anyone's successor, i.e. path heads.
nodes = {}
start_nodes = set()
all_nodes = set()
for conn in connections:
    pre,suc = conn.split('\t')
    if (pre in precs_duplicates or suc in sucs_duplicates):
        if args.verbose:
            print >>tr, conn
    else:
        nodes[pre] = suc
        start_nodes.add(pre)
        all_nodes.add(pre[:-2])
        all_nodes.add(suc[:-2])
if args.verbose:
    tr.close()
start_nodes -= set(nodes.values())
paths = []
contigs = {}
# Load the assembly so paths can be stitched into scaffold sequences.
for record in SeqIO.parse(args.fastafile, "fasta") :
    contigs[record.id] = str(record.seq)
string_of_ns = "N" * args.number_of_ns
i = 0;
for start_node in start_nodes:
    path=getPath(nodes, start_node)
    # NOTE(review): every path also exists in reverse-complement form; this
    # lexicographic endpoint comparison appears intended to emit only one of
    # each mirrored pair — confirm it holds for all contig-name schemes.
    if path[0] > path[-1]:
        i += 1
        print >>confiledict, path
        scaffold_file.write('>Path_' + str(i) +'_' + str(path) + '\n' + string_of_ns.join([getContig(contigs,node) for node in path]) + '\n')
# Contigs not used by any path are written through unchanged.
for contig_id in contigs.keys():
    if contig_id not in all_nodes:
        scaffold_file.write('>' + contig_id + '\n' + contigs[contig_id] + '\n')
confiledict.close()
scaffold_file.close()
# END OF SCAFFOLDING
# Transcripts considered "placed": single-hsp, same-contig, plus the
# multi-contig ones that passed the one-contig coverage check above.
transcripts_passed_number = len(uninformative) + len(same_contig) + transcripts_passed
# print the stats
s.write('Program was called as:' + '\n')
s.write(DEFAULT_NAME + '\n')
s.write(' -b ' + '\t' + args.blastfile + '\n')
s.write(' -f ' + '\t' + args.fastafile + '\n')
s.write(' -hid ' + '\t' + str(args.hsp_identity_cutoff) + '\n')
s.write(' -tid ' + '\t' + str(args.transcript_identity_cutoff) + '\n')
s.write(' -tcov ' + '\t' + str(args.transcript_coverage_cutoff) + '\n')
s.write(' -max ' + '\t' + str(args.maximum_intron_size) + '\n')
s.write(' -eov ' + '\t' + str(args.exon_overlap_cutoff) + '\n')
s.write(' -tov ' + '\t' + str(args.transcript_overlap_cutoff) + '\n')
s.write(' -lis ' + '\t' + str(args.library_insert_size) + '\n')
s.write(' -ns ' + '\t' + str(args.number_of_ns) + '\n')
s.write('---------------------------------------------' + '\n')
s.write('Transcripts with no hits:' + str(len(no_hit)) + '\n')
s.write('Transcripts with 1 hsp:' + str(len(uninformative)) + '\n')
s.write('Transcripts hitting 1 contig with multiple hsps:' + str(len(same_contig)) + '\n')
s.write('Transcripts hitting multiple contigs:' + str(len(informative)) + '\n')
# Summary statistics over all observed intron sizes and exon overlaps.
s.write('Intron_size Mean:' + str(numpy.array(intron_sizes).mean(axis=0)) + '\t' + 'Median:' + str(numpy.median(numpy.array(intron_sizes), axis=0)) + '\t' + 'SD:' + str(numpy.array(intron_sizes).std(axis=0)) + '\n')
s.write('Exon_Overlap Mean:' + str(numpy.array(exon_overlaps).mean(axis=0)) + '\t' + 'Median:' + str(numpy.median(numpy.array(exon_overlaps), axis=0)) + '\t' + 'SD:' + str(numpy.array(exon_overlaps).std(axis=0)) + '\n')
s.write('Connections removed based on max intron size:' + str(conn_removed) + '\n')
s.write('---------------------------------------------' + '\n')
s.write('---------------------------------------------' + '\n')
s.write('Transcripts with no hits:' + str(len(no_hit)) + '\n')
s.write('Transcripts more than ' + str(DEFAULT_TRANSCRIPT_COVERAGE_ONE_CONTIG) + ' in one contig:' + str(transcripts_passed_number) + '\n')
s.write('Transcripts less than ' + str(DEFAULT_TRANSCRIPT_COVERAGE_ONE_CONTIG) + ' in one contig:' + str(transcripts_failed) + '\n')
s.close()
#####
# Ending prints
#####
localtime = time.asctime( time.localtime(time.time()) )
end = time.time()
print 'Program ended: ' + localtime
# Wall-clock runtime reported in several units.
seconds = round((end - start),3)
minutes = round((seconds/60),3)
hours = round((minutes/60),3)
print 'SCUBATv2 finished in ' + str(seconds) + ' seconds or ' + str(minutes) + ' minutes or ' + str(hours) + ' hours'