-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathnaive_with_rc.py
227 lines (189 loc) · 7.83 KB
/
naive_with_rc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
# returns reverse complement of DNA
def reverseComplement(s):
complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}
t = ''
for base in s:
t = complement[base] + t
return t
# naive exact matching algorithm
# Returns a list of occurrences (offset)
def naive(p, t):
occurrences = []
for i in range(len(t) - len(p) + 1): # loop over alignments
match = True
for j in range(len(p)): # loop over characters
if t[i+j] != p[j]: # compare characters
match = False
break
if match:
occurrences.append(i) # all chars matched; record
return occurrences
# naive exact matching algorithm that is strand-aware
# Instead of looking only for occurrences of P in T, additionally look for occurrences of the reverse complement of P in T.
def naive_with_rc(p, t):
r = reverseComplement(p)
if r == p:
return naive(p,t)
else:
return naive(p,t) + naive(r,t)
# naive exact matching algorithm
# Returns a list of occurrences (offset)
def naive_2mm(p, t):
occurrences = []
for i in range(len(t) - len(p) + 1): # loop over alignments
match = True
mismatch_count = 0
for j in range(len(p)): # loop over characters
if t[i+j] != p[j]: # compare characters
mismatch_count+=1
if mismatch_count > 2:
match = False
break
if match:
occurrences.append(i) # all chars matched; record
return occurrences
# parses a DNA reference genome from a file in the FASTA format.
def readGenome(filename):
genome = ''
with open(filename, 'r') as f:
for line in f:
# ignore header line with genome information
if not line[0] == '>':
genome += line.rstrip()
return genome
# parses the read and quality strings from a FASTQ file containing sequencing reads
def readFastq(filename):
sequences = []
qualities = []
with open(filename) as fh:
while True:
fh.readline() # skip name line
seq = fh.readline().rstrip() # read base sequence
fh.readline() # skip placeholder line
qual = fh.readline().rstrip() # base quality line
if len(seq) == 0:
break
sequences.append(seq)
qualities.append(qual)
return sequences, qualities
p = 'CCC'
ten_as = 'AAAAAAAAAA'
t = ten_as + 'CCC' + ten_as + 'GGG' + ten_as
occurrences = naive_with_rc(p, t)
print(p)
print(t)
print(occurrences)
# Assert: [10, 23]
p = 'CGCG'
t = ten_as + 'CGCG' + ten_as + 'CGCG' + ten_as
occurrences = naive_with_rc(p, t)
print(p)
print(t)
print(occurrences)
# Assert: [10, 24]
# wget http://d396qusza40orc.cloudfront.net/ads1/data/phix.fa
phix_genome = readGenome('phix.fa')
occurrences = naive_with_rc('ATTA', phix_genome)
print('offset of leftmost occurrence: %d' % min(occurrences))
# offset of leftmost occurrence: 62
print('# occurrences: %d' % len(occurrences))
# occurrences: 60
# Question 1
# How many times does AGGT or its reverse complement (ACCT) occur in the lambda virus genome?
# E.g. if AGGT occurs 10 times and ACCT occurs 12 times, you should report 22.
lambda_virus_genome = readGenome('lambda_virus.fa')
occurrences = naive_with_rc('AGGT', lambda_virus_genome)
print('AGGT in lambda_virus:')
print('# occurrences: %d' % len(occurrences))
# Question 2
# How many times does TTAA or its reverse complement occur in the lambda virus genome?
# Hint: TTAA and its reverse complement are equal, so remember not to double count.
occurrences = naive_with_rc('TTAA', lambda_virus_genome)
print('TTAA in lambda_virus:')
print('# occurrences: %d' % len(occurrences))
# Question 3
# What is the offset of the leftmost occurrence of ACTAAGT or its reverse complement in the Lambda virus genome?
# E.g. if the leftmost occurrence of ACTAAGT is at offset 40 (0-based) and the leftmost occurrence of its reverse
# complement ACTTAGT is at offset 29, then report 29.
occurrences = naive_with_rc('ACTAAGT', lambda_virus_genome)
print('offset of leftmost occurrence of ACTAAGT: %d' % min(occurrences))
# Question 4
# What is the offset of the leftmost occurrence of AGTCGA or its reverse complement in the Lambda virus genome?
occurrences = naive_with_rc('AGTCGA', lambda_virus_genome)
print('offset of leftmost occurrence of AGTCGA: %d' % min(occurrences))
# Question 5
# As we will discuss, sometimes we would like to find approximate matches for P in T. T
# hat is, we want to find occurrences with one or more differences.
# For Questions 5 and 6, make a new version of the naive function called naive_2mm that allows up to
# 2 mismatches per occurrence. Unlike for the previous questions, do not consider the reverse complement here.
# We're looking for approximate matches for P itself, not its reverse complement. 
# For example, ACTTTA occurs twice in ACTTACTTGATAAAGT, once at offset 0 with 2 mismatches, and once at offset 4 with 1 mismatch.
# So naive_2mm('ACTTTA', 'ACTTACTTGATAAAGT') should return the list [0, 4].
# How many times does TTCAAGCC occur in the Lambda virus genome when allowing up to 2 mismatches?
occurrences = naive_2mm('TTCAAGCC', lambda_virus_genome)
print('TTCAAGCC (up to 2 mismatches) in lambda_virus:')
print('# occurrences: %d' % len(occurrences))
# Question 6
# What is the offset of the leftmost occurrence of AGGAGGTT in the Lambda virus genome when allowing up to 2 mismatches?
occurrences = naive_2mm('AGGAGGTT', lambda_virus_genome)
print('AGGAGGTT (up to 2 mismatches) in lambda_virus:')
print('offset of leftmost occurrence: %d' % min(occurrences))
# Test 1
p = 'CTGT'
ten_as = 'AAAAAAAAAA'
t = ten_as + 'CTGT' + ten_as + 'CTTT' + ten_as + 'CGGG' + ten_as
occurrences = naive_2mm(p, t)
print(occurrences)
# [10, 24, 38]
# Test 2
phix_genome = readGenome('phix.fa')
occurrences = naive_2mm('GATTACA', phix_genome)
print('offset of leftmost occurrence: %d' % min(occurrences))
# offset of leftmost occurrence: 10
print('# occurrences: %d' % len(occurrences))
# occurrences: 79
# Question 7
# Finally, download and parse the provided FASTQ file containing real DNA sequencing reads derived from a human:
# https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/ERR037900_1.first1000.fastq
# Note that the file has many reads in it and you should examine all of them together when answering this question.
# The reads are taken from this study:
# Ajay, S. S., Parker, S. C., Abaan, H. O., Fajardo, K. V. F., & Margulies, E. H. (2011).
# Accurate and comprehensive sequencing of personal genomes. Genome research, 21(9), 1498-1505.
# This dataset has something wrong with it; one of the sequencing cycles is poor quality.
# Report which sequencing cycle has the problem.
# Remember that a sequencing cycle corresponds to a particular offset in all the reads.
# For example, if the leftmost read position seems to have a problem consistently across reads, report 0.
# If the fourth position from the left has the problem, report 3. Do whatever analysis you think is needed to identify the bad cycle. It might help to review the "Analyzing reads by position" video.
def findGCByPos(reads):
gc = [0] * 100
totals = [0] * 100
for read in reads:
for i in range(len(read)):
if read[i] == 'C' or read[i] == 'G':
gc[i] += 1
totals[i] += 1
for i in range(len(gc)):
if totals[i] > 0:
gc[i] /= float(totals[i])
return gc
def phred33ToQ(qual):
return ord(qual) - 33
def createHist(qualityStrings):
# Create a histogram of quality scores
hist = [0]*50
for read in qualityStrings:
for phred in read:
q = phred33ToQ(phred)
hist[q] += 1
return hist
seqs, quals = readFastq('ERR037900_1.first1000.fastq')
# gc = findGCByPos(seqs)
# plt.plot(range(len(gc)), gc)
import matplotlib.pyplot as plt
h = createHist(quals)
print(h)
#for i in range(len(h)):
# print(i)
# print(h[i])
plt.plot(range(len(h)), h)
plt.show()