forked from schneiderthomas/OTA-pipeline
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclipPrimers_both_ends.py
1250 lines (995 loc) · 52.5 KB
/
clipPrimers_both_ends.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/python
import pysam
import csv
import Bio.Seq
import regex
import sys, getopt
# http://www.tutorialspoint.com/python/python_command_line_arguments.htm
# Returns "Start" if You need to look for the primer at the "start" of the read, returns "End" if you need to look for the primer at the end of the read
# parameters are passed by reference by default
#FOR REFERENCE NOT USING ANYMORE
def where_to_look_for_primer(selectedread):
if selectedread.is_reverse:
if selectedread.is_read1:
currentPrimerList = end_pos
else:
currentPrimerList = end_neg
else:
if selectedread.is_read1:
currentPrimerList = start_neg
else:
currentPrimerList = start_pos
return currentPrimerList
#FOR REFERENCE NOT USING ANYMORE
def getmatch(selectedread, primerList, target_portion_read):
count_end = len(primerList)-1
count = 0
endreadmatchlength = None
returnlist = None
foundPrimerLength = None
while (count <= count_end):
# CHECKING IF THE READ IS REVERSED, IF IT IS REVERSED THEN ENDING SEQUENCE CONTAINS THE PRIMER
# IF THE READ IS NOT REVERSED THEN THE BEGINNED SEQUENCE CONTAINS THE PRIMER
currentPrimer = primerList[count]
if selectedread.is_reverse:
endPrimer = currentPrimer[::-1]
endtestprimer = "(" + endPrimer + ")" + "{e<=2}"
end_reverse_read = target_portion_read[::-1]
match = regex.match(endtestprimer, end_reverse_read)
else:
# searching the beginning, with allowable error of 2 or less
testprimer = "(" + currentPrimer + ")" + "{e<=2}"
match = regex.match(testprimer, target_portion_read)
# index_of_primer_start = read.query_sequence.find(currentPrimer,0,40)
# if index_of_primer != -1:
if match != None:
count = count_end + 1
endreadmatchlength = match.span()[1]
foundPrimerLength = len(currentPrimer)
returnlist = [endreadmatchlength, foundPrimerLength]
else:
if count != count_end:
count = count + 1
else:
# THIS MEANS THERE IS NO MATCHES AND THEREFORE UNSURE WHAT PRIMER USED AND SO CUTTING OFF THE LENGTH OF THE LARGEST PRIMER WHICH IS THIRTY
count = count + 1
return returnlist
def getmatchall(readforward, readreverse, PrimerDictList, usePosStrand, target_portion_read_forward, target_portion_read_reverse):
# forward and reverse fasta files should be the same length
currentChr=readforward.tid
currentPos=readforward.pos
count_end = len(PrimerDictList)-1
count = 0
endreadmatchlengthforward = None
endreadmatchlengthreverse = None
returnlist = None
foundPrimerLengthforward = None
foundPrimerLengthreverse = None
while (count <= count_end):
if PrimerDictList[count]["strand"] == '-' and usePosStrand:
count= count +1
continue
elif int(PrimerDictList[count]["chr"]) != currentChr:
count= count +1
continue
elif abs(int(PrimerDictList[count]["startPos"])-currentPos) > 400:
count= count + 1
continue
# CHECKING IF THE READ IS REVERSED, IF IT IS REVERSED THEN ENDING SEQUENCE CONTAINS THE PRIMER
# IF THE READ IS NOT REVERSED THEN THE BEGINNED SEQUENCE CONTAINS THE PRIMER
currentPrimerforward = PrimerDictList[count]["currentStartPrimer"]
currentPrimerreverse = PrimerDictList[count]["currentEndPrimer"]
# if selectedread.is_reverse:
endPrimerreverse = currentPrimerreverse[::-1]
testprimerreverse = "(" + endPrimerreverse + ")" + "{e<=2}"
end_reverse_read = target_portion_read_reverse[::-1]
matchreverse = regex.match(testprimerreverse, end_reverse_read)
# else:
# searching the beginning, with allowable error of 2 or less
testprimerforward = "(" + currentPrimerforward + ")" + "{e<=2}"
matchforward = regex.match(testprimerforward, target_portion_read_forward)
# index_of_primer_start = read.query_sequence.find(currentPrimer,0,40)
# if index_of_primer != -1:
if matchreverse != None and matchforward != None:
targetLength= PrimerDictList[count]["targetLength"]
startPos = PrimerDictList[count]["startPos"]
endPos = PrimerDictList[count]["endPos"]
count = count_end + 1
endreadmatchlengthforward = matchforward.span()[1]
endreadmatchlengthreverse = matchreverse.span()[1]
foundPrimerLengthforward = len(currentPrimerforward)
foundPrimerLengthreverse = len(currentPrimerreverse)
returnlist = [endreadmatchlengthforward, foundPrimerLengthforward, endreadmatchlengthreverse, foundPrimerLengthreverse,targetLength,startPos,endPos]
elif matchreverse != None:
#print "Reverse matched but forward did not"
#print "The read forward is below"
#print readforward
#print target_portion_read_forward
#print "The read reverse is below"
#print readreverse
#print target_portion_read_reverse
#print "The fasta file forward is below"
#print primerListforward.filename
#print "The fasta file reverse is below"
#print primerListreverse.filename
count = count + 1
elif matchforward != None:
#print "Forward matched but reverse did not"
#print "The read forward is below"
#print readforward
#print target_portion_read_forward
#print "The read reverse is below"
#print readreverse
#print target_portion_read_reverse
#print "The fasta file forward is below"
#print primerListforward.filename
#print "The fasta file reverse is below"
#print primerListreverse.filename
count = count + 1
else:
count = count + 1
return returnlist
def change_cigar_either(booleanIsReverse,originalcigar, endreadmatchlength, numberlist, numberlistint, letterlist, expectedEndReadMatchLength, readLength, read,targetLength,startManifestPos,endManifestPos):
#returns list of two indexes one says if there was an error making the cigar and other says if there is overlapping of reads
overlappingReads=False
# because changing read.position
global forwardread
global reverseread
oldAlignmentStart = 0
# print sum(numberlistint)
###DEBUGGING
#if forwardread.qname != "NS500796:7:H77VTAFXX:1:11101:7184:4318":
#return True
# print 'Found M01382:52:000000000-AAECF:1:2102:21256:8889'
# print 'The endreadmatchlength is ' + str(endreadmatchlength)
# IF IT IS NOT LESS THEN OR EQUAL TO endreadmatchlength
indexchanging = 0
deletionPresent = False
# numberToSubtract is dependent on whether or not there is a deletion present in the primer area, need to adjust position by the number of deletions present
# (shouldn't be more than 2 since only 2 deletions are allowed, if more then read should be thrown out)
# numberToSubtract and InsertionCount, only works if in the primer
DeletionCount = 0
DeletionCountExceptEnd=0
InsertionCount = 0
insertionAtEnd = 0
booleanDeletionAtEnd = False
booleanInsertionAtEnd = False
runningTotal = 0
DeletionPos = 0
if booleanIsReverse:
numberlist = numberlist[::-1]
numberlistint = numberlistint[::-1]
letterlist = letterlist[::-1]
oldAlignmentStartR = reverseread.qstart
oldPositionStart = reverseread.pos
#need to do this for reverse
#WILL DO POSITION CHECK LATER FOR REVERSE
#if endManifestPos-oldPositionStart-reverseread.qlen > 0:
# print read.qname
# print "The position is " + str(oldPositionStart)
# print "The q length is " + str(reverseread.qlen)
#when there is a deletion (ie there is less of the probe because the position start is different)
#the oldPositionStart is actually smaller and hence it will be > 0 since normal is zero
#print "The reverse read position is " + str(reverseread.pos)
#print "The reverse read position is " + str(reverseread.pos+reverseread.qlen)
#print "The reverse read qlen is " + str(reverseread.qlen)
#print "The reverse read qstart is " + str(reverseread.qstart)
#FOR REV
# DeletionCount= DeletionCount + abs(endManifestPos-oldPositionStart-reverseread.qlen)
# DeletionPos=DeletionCount
#elif endManifestPos-oldPositionStart-reverseread.qlen < 0:
# InsertionCount= InsertionCount + (endManifestPos-oldPositionStart-reverseread.qlen)
#else:
# return True
#print str(endManifestPos-forwardread.pos)
else:
oldAlignmentStart = forwardread.qstart
oldPositionStart = forwardread.pos
softclipbeginning = 0
if letterlist[0]=='S':
softclipbeginning = int(numberlist[0])
forwardreadOffset = startManifestPos - forwardread.pos + softclipbeginning
if forwardreadOffset != 1 and (forwardreadOffset > 4 and forwardreadOffset < -2):
#ie maps position "early" because skips first read on probe (like a deletion) or maps late because it added on a read
#if too match then too much error in the probe
return [True,overlappingReads]
if forwardreadOffset > 1:
#print "Hello insertion!!!!!!!!!!!!!!!!!!!"
#print str(startManifestPos - forwardread.pos)
InsertionCount= InsertionCount + (startManifestPos - forwardread.pos - 1)
#the expectedpostionforward - forwardread.pos should equal 1 expected, because forwardread.pos is base 0? if difference < 1 deletion, if > 1 insertion
if forwardreadOffset < 1:
#ie maps position "early" because skips first read on probe (like a deletion)
#print "Hello"
#print str(startManifestPos - forwardread.pos)
#need to add on DeletionPos makes it work (if not get cigar error where 25S125M when should be 25S126M)
#DeletionPos will not be used to modify the position
DeletionCount= DeletionCount + abs(startManifestPos - forwardread.pos - 1)
DeletionPos=DeletionCount
#runningTotal=runningTotal+DeletionCount
# #THERE MAY BE A SITUATION IF THERE IS TANDEM DUPLICATION shortly after a probe and the duplicated entry gets a match even though THOUGH THERE ARE TOO MANY DELETIONS, NEED TO GET RID OF THIS SCENARIO BECAUSE ONLY EXPECTING 2 deletions at most
if not sum(numberlistint) <= expectedEndReadMatchLength:
for index in range(len(numberlist)):
#
if expectedEndReadMatchLength <= numberlistint[index] + runningTotal:
indexchanging = index
runningTotal = runningTotal + numberlistint[index]
if letterlist[index] == 'D' or letterlist[index] == 'H' or letterlist[index] == 'N' or letterlist[index] == 'P':
deletionPresent = True
booleanDeletionAtEnd = True
DeletionCountExceptEnd = DeletionCount
DeletionCount = DeletionCount + (runningTotal - expectedEndReadMatchLength) + 1
runningtotalBeforeDeletion = runningTotal - numberlistint[index]
if letterlist[index] == 'I':
insertionAtEnd = expectedEndReadMatchLength - (runningTotal - numberlistint[index])
if DeletionCount + InsertionCount + insertionAtEnd >= 3:
return [True,overlappingReads]
break
else:
if letterlist[index] == 'D' or letterlist[index] == 'H' or letterlist[index] == 'N' or letterlist[index] == 'P':
deletionPresent = True
DeletionCount = DeletionCount + numberlistint[index]
elif letterlist[index] == 'I':
InsertionCount = InsertionCount + numberlistint[index]
runningTotal = runningTotal + numberlistint[index]
else:
return [True,overlappingReads]
#The position modification is not valid if there is a deletion in the primer when a reverse read, hence need to check if a deletion is present in the reverse
#read
if booleanIsReverse:
#get total Deletions and total insertions
totalDeletions=0
totalInsertions=0
totalSoftClip=0
for index in range(len(numberlist)):
if letterlist[index] == 'D':
totalDeletions = totalDeletions + numberlistint[index]
elif letterlist[index] == 'I':
totalInsertions = totalInsertions + numberlistint[index]
elif letterlist[index] == 'S':
totalSoftClip == totalSoftClip + numberlistint[index]
reversereadOffset= endManifestPos-oldPositionStart-reverseread.qlen - totalDeletions + totalInsertions - totalSoftClip
if abs(reversereadOffset) > 2:
#if too match then too much error in the probe
return [True,overlappingReads]
if reversereadOffset > 0:
DeletionCount= DeletionCount + abs(reversereadOffset)
DeletionPos=abs(reversereadOffset)
#if reversereadOffset > 2:
# print "Oh no!"
# print read.qname
# print read.cigarstring
# print "below is the results"
# print str(reversereadOffset)
# print "totalSoftClip is " + str(totalSoftClip)
# print "the chr location is " + str(read.tid)
# print "the location is " + str(endManifestPos)
elif reversereadOffset < 0:
#print "We are in endManifestPos-oldPositionStart-reverseread.qlen - totalDeletions + totalInsertions"
#print read.qname
InsertionCount= InsertionCount + abs(endManifestPos-oldPositionStart-reverseread.qlen - totalDeletions + totalInsertions -totalSoftClip)
# check if deletion is present in the primer (if so need to adjust cigar so the cigar length is what is expected, need to substract from not the primer)
# ie if 100M1D21M, need it to be 94M27S not 95M27S
# if expected endreadmatchlength do not match that means there was either a 1 or 2 bp deletion or 1 or 2 bp insertion
# need to subtract or add on to the last
# IF INSERTION NEED TO MOVE POSITION BACK INSTEAD OF FORWARD
#TO MAKE THINGS EASIER
#numberToSubtractTotal = DeletionCount + InsertionCount + insertionAtEnd
TotalInsertionsPresent = InsertionCount + insertionAtEnd
# #PLEASENOTE: INSERTIONATEND DOES NOT CHANGE POSITION
newcigar = ""
newcigarnumberlist = []
newcigarletterlist = []
needtomodifynumber = False
numberToSubtractTotal = 0
for index in range(len(numberlist)):
if index == 0:
# newcigar=newcigar+str(expectedEndReadMatchLength)+'S'
# newcigarnumberlist.append(str(expectedEndReadMatchLength-numberToSubtract))
if booleanDeletionAtEnd == True:
newcigarnumberlist.append(str(runningtotalBeforeDeletion + InsertionCount -DeletionCountExceptEnd))
else:
#newcigarnumberlist.append(str(expectedEndReadMatchLength + InsertionCount -DeletionCount))
newcigarnumberlist.append(str(expectedEndReadMatchLength + InsertionCount -DeletionCount))
newcigarletterlist.append('S')
if indexchanging == 0 and not (letterlist[indexchanging] == 'S' or letterlist[indexchanging] == 'H'):
# newcigar=newcigar+str(runningTotal-expectedEndReadMatchLength)+letterlist[indexchanging]
##TODO MAKE SUBFUNCTION AS CALLED TWICE
newnumber= runningTotal - (expectedEndReadMatchLength + InsertionCount) + DeletionPos
if newnumber > 0:
newcigarnumberlist.append(str(newnumber))
newcigarletterlist.append(letterlist[indexchanging])
elif newnumber == 0:
continue
else:
numberToSubtractTotal = abs(newnumber)
needtomodifynumber= True
###END TODO
# else just keep the position the same
else:
if indexchanging == 0:
# IF THE CURRENT S or H is larger then the primer then get rid of the primer length added to the newcigarnumberlist
del newcigarnumberlist[len(newcigarnumberlist) - 1]
newcigarnumberlist.append(str(numberlistint[0]))
elif index < indexchanging:
continue
elif index == indexchanging:
if letterlist[indexchanging] == 'D' or letterlist[indexchanging] == 'H':
#print 'I am trying to modify a read with a deletion at the end of the primer'
#print 'The read is ' + str(read.qname)
needtomodifynumber = True
booleanDeletionAtEnd = True
newnumber= runningTotal - (expectedEndReadMatchLength + InsertionCount) + DeletionPos
#print 'The newnumber is ' + str(newnumber)
if newnumber > 0:
newcigarnumberlist.append(str(newnumber))
newcigarletterlist.append(letterlist[indexchanging])
else:
numberToSubtractTotal = abs(newnumber)
# DO NOTHING, THE DELETION LETTER WILL BE REMOVES (ONLY TWO SCENARIOS REMEMBER CAN'T HAVE MORE THAN 2 INS OR DEL
elif letterlist[indexchanging] == 'I':
#NOTE: WE ARE NOT MODIFYING IF THERE IS AN INSERTION AT THE END
needtomodifynumber = True
booleanInsertionAtEnd = True
numberToSubtractTotal = TotalInsertionsPresent
newcigarnumberlist.append(numberlist[indexchanging])
newcigarletterlist.append(letterlist[indexchanging])
else:
##TODO MAKE SUBFUNCTION AS CALLED TWICE
newnumber= runningTotal - (expectedEndReadMatchLength + InsertionCount) + DeletionPos
if newnumber > 0:
newcigarnumberlist.append(str(newnumber))
newcigarletterlist.append(letterlist[indexchanging])
#numberToSubtractTotal = abs(DeletionCount)
#needtomodifynumber= True
elif newnumber == 0:
continue
else:
numberToSubtractTotal = abs(newnumber)
needtomodifynumber= True
##END TODO
else:
if needtomodifynumber and letterlist[index] != 'D' and letterlist[index] !='I':
newnumber = numberlistint[index] - numberToSubtractTotal
if newnumber > 0:
newcigarnumberlist.append(str(newnumber))
newcigarletterlist.append(letterlist[index])
needtomodifynumber = False
else:
numberToSubtractTotal = numberToSubtractTotal - numberlistint[index] + 1
newcigarnumberlist.append(str(1))
newcigarletterlist.append(letterlist[index])
elif needtomodifynumber and letterlist[index] == 'D' and InsertionCount >0:
#THIS IS A SPECIFIC SITUATION AND NEED TO ADD TO DELETIONCOUNT IN ORDER TO PROPERLY POSITION THE READ IF IT IS FORWARD
DeletionCount = DeletionCount + 1
newnumber = numberlistint[index] - numberToSubtractTotal
if newnumber > 0:
newcigarnumberlist.append(str(newnumber))
newcigarletterlist.append(letterlist[index])
needtomodifynumber = False
elif newnumber == 0:
#KNOW THAT THIS IS A CERTAIN SITUATION IN WHICH WILL WANT TO REDUCE SIZE OF CLIP
newcigarnumberlist[len(newcigarletterlist)-1]=str(int(newcigarnumberlist[len(newcigarletterlist)-1])-1)
needtomodifynumber = False
else:
numberToSubtractTotal = numberToSubtractTotal - numberlistint[index] + 1
newcigarnumberlist.append(str(1))
newcigarletterlist.append(letterlist[index])
else:
newcigarnumberlist.append(numberlist[index])
newcigarletterlist.append(letterlist[index])
if newcigarletterlist[0] != 'S':
print "there was an error and a cigar was not clipped"
return [True,overlappingReads]
dontClipEnd=False
numbertoClip=0
indexchangingAll=len(newcigarnumberlist)-1
deletionEnd=0
insertionEnd=0
softEnd=0
matchEnd=0
foundIndexChanged=False
#print "Started new end read clip"
if sum(numberlistint) > expectedEndReadMatchLength + targetLength:
numbertoClip = 0-targetLength
#starting at one because not wanting to count the probe that was soft clipped
for index in range(1,len(newcigarnumberlist)):
if numbertoClip < 0:
if (newcigarletterlist[index] != 'D') and (newcigarletterlist[index] != 'I') and (newcigarletterlist[index] != 'S') and (newcigarletterlist[index] != 'H') and (newcigarletterlist[index] != 'N') and (newcigarletterlist[index] != 'P'):
numbertoClip=numbertoClip+int(newcigarnumberlist[index])
#because less of target will be covered if there is an insertion, DO NOT DO ANYTHING IF INSERTION (THE NUMBER OF MATCHES IN CIGAR ALREADY LESS)
#elif newcigarletterlist[index] == 'I':
#numbertoClip=numbertoClip
elif newcigarletterlist[index] == 'D':
#because more of target will be covered if there is an deletion
#DONT NEED TO MULTIPLY BY TWO BECAUSE THE NUMBER COVERED WILL BE LESS
#numbertoClip=numbertoClip+(int(2*newcigarnumberlist[index]))
numbertoClip=numbertoClip+(int(newcigarnumberlist[index]))
elif newcigarletterlist[index] == 'S':
#soft clips are only at the end and since for loop started at 1 this means at the end, if it hasn't
#made the target yet will set numbertoClip at zero so in the next if statement it will be set as softEnd
numbertoClip=0
#has to be equal too since 0 represents it exactly reached the target point, using matchEnd, softEnd and insertionEnd after that
if numbertoClip >= 0:
if foundIndexChanged==False:
foundIndexChanged=True
indexchangingAll=index
if newcigarletterlist[index] == 'I':
#numbertoClip=numbertoClip+int(newcigarnumberlist[index])
insertionEnd=insertionEnd+int(newcigarnumberlist[index])
elif newcigarletterlist[index] == 'M' and index > indexchangingAll:
matchEnd= matchEnd + int(newcigarnumberlist[index])
elif newcigarletterlist[index] == 'S':
softEnd= matchEnd + int(newcigarnumberlist[index])
elif newcigarletterlist[index] == 'D':
if index==indexchangingAll:
#can't have indexchanging on a deletion, reseting
#indexchangingAll=len(newcigarnumberlist)-1
numbertoClip=0
deletionEnd = deletionEnd + int(newcigarnumberlist[index])
#numbertoClip=runningTotal-targetLength
if numbertoClip > 0 and dontClipEnd==False:
newcigar2 = ""
newcigarnumberlist2 = []
newcigarletterlist2 = []
for index in range(len(newcigarnumberlist)):
if index ==indexchangingAll:
currentNumber=int(newcigarnumberlist[index])
currentLetter=newcigarletterlist[index]
if currentLetter =='M':
if currentNumber-numbertoClip > 0:
newcigarletterlist2.append('M')
newcigarnumberlist2.append(str(currentNumber-numbertoClip))
newcigarletterlist2.append('S')
newcigarnumberlist2.append(str(numbertoClip+insertionEnd+matchEnd+softEnd))
else:
newcigarletterlist2.append('S')
newcigarnumberlist2.append(str(numbertoClip+insertionEnd+matchEnd+softEnd))
else:
newcigarletterlist2.append('S')
newcigarnumberlist2.append(str(numbertoClip+insertionEnd+matchEnd+softEnd))
elif index < indexchangingAll:
newcigarletterlist2.append(newcigarletterlist[index])
newcigarnumberlist2.append(str(newcigarnumberlist[index]))
else:
dontClipEnd=True
else:
dontClipEnd=True
if dontClipEnd != True:
overlappingReads=True
newcigarletterlist=newcigarletterlist2
newcigarnumberlist=newcigarnumberlist2
if booleanIsReverse:
newcigarnumberlist = newcigarnumberlist[::-1]
newcigarletterlist = newcigarletterlist[::-1]
for index in range(len(newcigarletterlist)):
###TODO GET RID OF IF FOUND NOT TO HAVE ERRORS
##if int(newcigarnumberlist[index]) <= 0:
### continue
newcigar = newcigar + newcigarnumberlist[index] + newcigarletterlist[index]
# Sum of lengths of the M/I/S/=/X operations shall equal the length of SEQ
if booleanIsReverse:
reverseread.cigarstring = newcigar
if dontClipEnd != True:
#####reverseread.pos = reverseread.pos + reverseread.qstart - oldAlignmentStart + DeletionCount - InsertionCount - insertionEnd + deletionEnd
reverseread.pos = reverseread.pos + reverseread.qstart - oldAlignmentStartR - insertionEnd + deletionEnd
###
#if abs(startManifestPos-oldAlignmentStartR) < abs(endManifestPos-oldAlignmentStartR):
# reverseread.pos = startManifestPos + expectedEndReadMatchLength
#else:
# reverseread.qstart = EndManifestPos + expectedEndReadMatchLength
if not booleanIsReverse:
forwardread.cigarstring = newcigar
# need to adjust position if a deletion was present in the primer area so adding + numbertosubtract
if startManifestPos - oldPositionStart != 1:
forwardread.pos = forwardread.pos + forwardread.qstart - oldAlignmentStart + DeletionCount - InsertionCount + (startManifestPos - oldPositionStart - 1)
else:
forwardread.pos = forwardread.pos + forwardread.qstart - oldAlignmentStart + DeletionCount - InsertionCount
###forwardread.pos = readExpectedPostion + DeletionCount - InsertionCount
#if abs(startManifestPos-oldAlignmentStart) < abs(endManifestPos-oldAlignmentStart):
# forwardread.qstart = startManifestPos + expectedEndReadMatchLength
#else:
# forwardread.qstart = EndManifestPos + expectedEndReadMatchLength
RealOnly = regex.compile(ur'[MIDN]')
RealLetterlist = regex.findall(RealOnly, newcigar)
# http://stackoverflow.com/questions/7368789/convert-all-strings-in-a-list-to-int
numberlistint = map(int, newcigarnumberlist)
calCigarLength = 0
zeroLengthElement=False
for eachindex in range(0, len(newcigarletterlist)):
if (newcigarletterlist[eachindex] != 'D') and (newcigarletterlist[eachindex] != 'H') and (newcigarletterlist[eachindex] != 'N') and (newcigarletterlist[eachindex] != 'P'):
calCigarLength = abs(numberlistint[eachindex]) + calCigarLength
for eachindex in range(0, len(numberlistint)):
if numberlistint[eachindex]== 0:
zeroLengthElement=True
# if letterlist has D,H,N,P; then don't write it down
if (calCigarLength != readLength) or zeroLengthElement:
print "Cigar length did not match or Zero Length Element"
print read.qname
print read.query_length
if booleanIsReverse:
print "the deletion count is " + str(DeletionCount)
print "the insertion count is " + str(InsertionCount)
print "the endManifestPos is " + str(endManifestPos)
print "the oldPositionStart is " + str(oldPositionStart)
print "the totalDeletions is " + str(totalDeletions)
print "the totalInsertions is " + str(totalInsertions)
print "the reverseread.qlen is " + str(readLength)
print "the endManifestPos-oldPositionStart-reverseread.qlen - totalDeletions + totalInsertions is " + str(endManifestPos-oldPositionStart-readLength - totalDeletions + totalInsertions)
print "is read revserse " + str(booleanIsReverse)
print "the length of letterlistR is: " + str(len(letterlist))
print "the length of the matched endreadmatchlength is:" + str(endreadmatchlength)
print "the length of the expected matched primer is:" + str(expectedEndReadMatchLength)
print "the new calculated Cigar Length is: " + str(calCigarLength)
print "the cigar length should be " + str(read.query_length)
print "the original cigar is: " + str(originalcigar)
print "the new cigarstring is: " + str(newcigar)
print "the target length is: " + str(targetLength)
print "return True at if (calCigarLength != read.query_length)"
return [True,overlappingReads]
if (not RealLetterlist):
print "continue at if (not RealLetterlistF) or (not RealLetterlistR)"
return [True,overlappingReads]
return [False,overlappingReads]
def main(argv):
global inputfile
global outputfile
global start_pos
global end_pos
global end_neg
global start_neg
global library_type
global reverseread
global forwardread
global current
inputfile = ''
outputfile = ''
library_type = ''
try:
opts, args = getopt.getopt(argv, "hi:o:t:m:", ["ifile=", "ofile=", "librarytype=", "manifest="])
except getopt.GetoptError:
print 'clipPrimers.py -i <inputfile> -o <outputfile> -t <library_type> -m <Illumina manifest file>'
sys.exit(2)
for opt, arg in opts:
if opt == '-h':
print 'clipPrimers.py -i <inputfile> -o <outputfile> -t <library_type> -m <Illumina manifest file>'
sys.exit()
elif opt in ("-i", "--ifile"):
inputfile = arg
elif opt in ("-o", "--ofile"):
outputfile = arg
elif opt in ("-t", "--librarytype"):
library_type = arg
elif opt in ("-m", "--manifest"):
manifest = arg
print 'The manifest file is "', manifest
print 'Input file is "', inputfile
print 'Output file "', outputfile
#start pos is postive strand ULSO sequences
start_pos=[]
#start neg is negative strand DLSO sequences, note these are reverse complement of sequences in manifest
start_neg=[]
#end pos is positive strand DLSO sequences
end_pos=[]
#end neg is negative strand ULSO sequences, note these are reverse complement of sequences in manifest
end_neg=[]
header=[]
#list where [strand,chr,startPosition,endPosition,currentStartPrimer,currentEndPrimer]
#customPrimerList=[]
rowcount=0
file_read = csv.reader(open(manifest, 'r'), delimiter='\t');
# for row in file_read:
# if rowcount==1:
# #this is a header row
# header=row
# rowcount=rowcount+1
# continue
# elif row[0]=='[Targets]':
# break
# elif rowcount==0:
# rowcount=rowcount+1
# continue
# currentDLSO=row[header.index('DLSO Sequence')]
# currentULSO=row[header.index('ULSO Sequence')]
# startPosition=row[header.index('Start Position')]
# endPosition=row[header.index('End Position')]
# strand=row[header.index('Strand')]
# chr=row[header.index('Chromosome')][3:]
# currentStartPrimer=''
# currentEndPrimer=''
# if strand=='-':
# currentStartPrimer=Bio.Seq.reverse_complement(currentDLSO)
# currentEndPrimer=Bio.Seq.reverse_complement(currentULSO)
# start_neg.append(currentStartPrimer)
# end_neg.append(currentEndPrimer)
# else:
# currentStartPrimer=currentULSO
# currentEndPrimer=currentDLSO
# start_pos.append(currentStartPrimer)
# end_pos.append(currentEndPrimer)
# currentlist=[strand,chr,startPosition,endPosition,currentStartPrimer,currentEndPrimer]
# customPrimerList.append((currentlist[:]))
# rowcount=rowcount+1
header=[]
headernext=False
inTargetsPortion=False
inProbesPortion=False
PrimerDictList=[]
for row in file_read:
if row[0] == '[Probes]':
headernext = True
inProbesPortion = True
inTargetsPortion = False
continue
elif row[0] == '[Targets]':
headernext = True
inTargetsPortion = True
inProbesPortion = False
inTargetInteger = 0
continue
elif row[0] == '[Intervals]':
break
elif headernext == True:
header = row
headernext = False
continue
elif inProbesPortion == True:
currentULSO = str(row[header.index('ULSO Sequence')])
currentDLSO = str(row[header.index('DLSO Sequence')])
currentDict = {'ULSO Sequence':currentULSO, 'DLSO Sequence':currentDLSO}
currentDLSO = row[header.index('DLSO Sequence')]
currentULSO = row[header.index('ULSO Sequence')]
# startPosition=row[header.index('Start Position')]
# endPosition=row[header.index('End Position')]
strand = row[header.index('Strand')]
chr = row[header.index('Chromosome')][3:]
currentStartPrimer = ''
currentEndPrimer = ''
if strand == '-':
currentStartPrimer = Bio.Seq.reverse_complement(currentDLSO)
currentEndPrimer = Bio.Seq.reverse_complement(currentULSO)
else:
currentStartPrimer = currentULSO
currentEndPrimer = currentDLSO
currentDict = {"strand":strand, "chr":chr, "startPos":0, "endPos":0, "currentStartPrimer":currentStartPrimer, "currentEndPrimer":currentEndPrimer,
"startPrimerLength":0, "endPrimerLength":0, "targetLength":0}
PrimerDictList.append(currentDict)
continue
elif inTargetsPortion == True:
chr = str(row[header.index('Chromosome')])
startPos = int(str(row[header.index('Start Position')]))
endPos = int(str(row[header.index('End Position')]))
PrimerDictList[inTargetInteger]["startPos"] = startPos
PrimerDictList[inTargetInteger]["endPos"] = endPos
startPrimerLength = len(PrimerDictList[inTargetInteger]["currentStartPrimer"])
endPrimerLength = len(PrimerDictList[inTargetInteger]["currentEndPrimer"])
targetLength = endPos - startPos - len(PrimerDictList[inTargetInteger]["currentStartPrimer"]) - len(PrimerDictList[inTargetInteger]["currentEndPrimer"])
#adding plus 3 to match the clipping for the start probe (because actually clip before end of probe at the start, which is what illumina does)
PrimerDictList[inTargetInteger]["targetLength"] = targetLength+3
PrimerDictList[inTargetInteger]["startPrimerLength"] = startPrimerLength
PrimerDictList[inTargetInteger]["endPrimerLength"] = endPrimerLength
# print PrimerDictList[inTargetInteger]["targetLength"]
inTargetInteger = inTargetInteger + 1
else:
continue
samfile = pysam.AlignmentFile(inputfile)
# template copies the header from the other alignment file
newreads = pysam.AlignmentFile(outputfile, "wb", template=samfile)
readnum = 0
read1 = None
read2 = None
for read in samfile: # TMS KEY CHANGE (GET RID OF FETCH)
# have to process reads1 and reads2
# the read.rnext==read.rname is another double check they are on the same chromosome
#hence I am filtering out secondary alignments
if read.is_read1 and read.rnext == read.rname and not read.is_secondary: # Store read1 of the pair and iterate the for loop.
read1 = read
continue
elif read.is_read2 and read.rnext == read.rname and not read.is_secondary: # Store read2. Should now have read1 and read2 from a properly-paired read. Do a bit of QC-ing of the read-pair.
read2 = read
if read1 is None:
read1 = None
read1 = None
print "WARNING READ1 EMPTY WHILE READ2 WAS NOT, IF SEE ERROR MULTIPLE TIMES CHECK IF SORTED BY QUERYNAME"
#print "continue at elif read.is_read2 and read.rnext == read.rname:"
continue
elif read2.qname != read1.qname:
read1 = None
read1 = None
#print "continue at elif read2.qname != read1.qname"
continue
elif read1.is_reverse and read2.is_reverse:
#one of the reads has to be forward, filtering out
read1 = None
read2 = None
continue
elif not read1.is_reverse and not read2.is_reverse:
#one of the reads has to be reverse, filtering out
read1 = None
read2 = None
continue
if read1 is None or read2 is None:
#print "continue at if read1 is None or read2 is None"
continue
# if read1.tid != read2.tid then that means the mate is mapped to a different chromosome, so getting rid of
#print read1.cigarstring
if (read1.query_length <= 35) or (read2.query_length <= 35) or (not isinstance(read.cigarstring, str)) or (read1.tid != read2.tid):
# newreads.write(read)
#print "continue at if (read1.query_length <= 35) or (read2.query_length <= 35) or (not isinstance(read.cigarstring, str)) or (read1.tid != read2.tid):"
continue
# ASSUMING READ1 AND READ2 are opposite in read.is_reverse
p = regex.compile(ur'\d+')
p2 = regex.compile(ur'[MIDNSHP=X]')
# would rather know which read is reverse and which is forward, will find out later if one is read1 or read2
#if read1reverse
usePosStrand=True
if read1.is_reverse:
reverseread = read1
forwardread = read2
currentPrimerListF=start_pos
currentPrimerListR=end_pos
usePosStrand=True
else:
reverseread = read2
forwardread = read1
currentPrimerListF=start_neg
currentPrimerListR=end_neg
usePosStrand=False
#if forwardread.cigarstring == '121M' and reverseread.cigarstring == '121M':
# print "The read is 121M on both sides, this read should be written."
# print "The read name is : " + read1.qname
numberlistR=[]
letterlistR=[]
numberlistF=[]
letterlistF=[]
try:
numberlistR = regex.findall(p, reverseread.cigarstring)
letterlistR = regex.findall(p2, reverseread.cigarstring)
numberlistF = regex.findall(p, forwardread.cigarstring)
letterlistF = regex.findall(p2, forwardread.cigarstring)
except:
read1 = None
read2 = None
print "ERROR: in making numberlist and letterlist"
continue
# http://stackoverflow.com/questions/7368789/convert-all-strings-in-a-list-to-int
numberlistintR = map(int, numberlistR)
numberlistintF = map(int, numberlistF)
if letterlistR[len(letterlistR) - 1] == 'H':
if numberlistintR[len(numberlistintR) - 1] > 2:
# then it clipped the end and hence it won't match a primer, filtering out
# newreads.write(read)
#print "continue at if numberlistintR[len(numberlistintR) - 1] > 2:"
#print "the reverseread is :" + str(reverseread.cigarstring)
continue
if letterlistF[0] == 'H':
if numberlistintF[0] > 2:
# then it clipped the start and hence it won't match a primer, filtering out
#print "continue at if numberlistintF[0] > 2:"
#print "the forwardread is :" + str(forwardread.cigarstring)
continue
# if the primer is already soft-clipped, likely probe mismatch, should get rid of, poor quality
if letterlistR[len(letterlistR) - 1] == 'S':
if numberlistintR[len(numberlistintR) - 1] > 10:
#print "continue at if numberlistintR[len(numberlistintR) - 1] > 10:"
#print "the reverseread is :" + str(reverseread.cigarstring)
continue
if letterlistF[0] == 'S':
if numberlistintF[0] > 10:
#print "continue at if numberlistintF[0] > 10:"
#print "the forwardread is :" + str(forwardread.cigarstring)
continue
#throw out if heavily soft clipped at end
if letterlistR[0] == 'S':
if numberlistintR[0] > 50:
#print "continue at if numberlistintR[0] > 50:"
#print "the reverseread is :" + str(reverseread.cigarstring)
continue
if letterlistF[len(numberlistintF) - 1] == 'S':
if numberlistintF[len(numberlistintF) - 1] > 50:
#print "continue at if numberlistintF[0] > 10:"
#print "the forwardread is :" + str(forwardread.cigarstring)
continue
# get read to check on
# get the last 35 reads
target_portion_read_R = reverseread.query_sequence[reverseread.query_length - 35:]
# need read start
target_portion_read_F = forwardread.query_sequence[0:35]
# currentSequence = read.query_sequence
# returns a match object
# print read.query_sequence
# print read.query_alignment_end
# print read.next_reference_start
# print read.is_reverse
#currentPrimerListR = ''
#currentPrimerListR = where_to_look_for_primer(reverseread)
#currentPrimerListF = ''
#currentPrimerListF = where_to_look_for_primer(forwardread)
#[currentPrimerListF,currentPrimerListR] = where_to_look_for_primer_both(forwardread)