-
Notifications
You must be signed in to change notification settings - Fork 1
/
models.py
421 lines (301 loc) · 14.9 KB
/
models.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
from django.db import models
from django.forms import ModelForm
from django.forms import ValidationError
from django.forms.util import ErrorList
from django.conf import settings
from django.db.models.fields.files import FieldFile
from django.db.models import Count
from urllib import urlopen
from Bio import AlignIO
from Bio.Nexus.Nexus import Tree
from StringIO import StringIO
from datetime import datetime
from itertools import izip
import os, tempfile
import mptt
from noraseq.scores.conservation import calculate_sequence_weights, conservation, window_score
ALIGNMENT_FORMAT_CHOICES = (
('clustal', 'Clustal'),
('fasta', 'Fasta'),
('stockholm', 'Stockholm'),
('emboss', 'EMBOSS'),
)
UPLOAD_DIR = 'noraseq/uploads'
class Alignment(models.Model):
name = models.CharField("Alignment name", max_length=100)
source_url = models.URLField(max_length=1000, blank=True, null=True)
local_file = models.FileField(blank=True, null=True, upload_to=UPLOAD_DIR)
format = models.CharField(
"Format of alignment file",
max_length=max([len(t[0]) for t in ALIGNMENT_FORMAT_CHOICES]),
choices = ALIGNMENT_FORMAT_CHOICES,
)
description = models.TextField("Description of alignment")
context_url = models.URLField("Optional URL with more info", max_length=1000, blank=True)
creation_date = models.DateTimeField(auto_now_add=True)
length = models.IntegerField(blank=True, null=True, editable=False)
newick_tree = models.TextField(blank=True)
@models.permalink
def get_absolute_url(self):
return ('noraseq.views.alignment_detail', [self.id])
def __unicode__(self):
return self.name
def get_biopy_alignment(self):
if '_biopy_alignment' not in self.__dict__:
# don't call set_biopy_alignment(). Want to make biopy_alignment available, not extract from it.
self.local_file.open('r')
self._biopy_alignment = AlignIO.read(self.local_file, self.format)
self.local_file.close()
return self._biopy_alignment
def set_biopy_alignment(self, value):
self._biopy_alignment = value
self.length = value.get_alignment_length()
self.save()
self.extract_related_objects()
biopy_alignment = property(get_biopy_alignment, set_biopy_alignment)
def get_alignment_file_contents(self):
if '_alignment_file_contents' not in self.__dict__:
self.local_file.open('r')
self._alignment_file_contents = self.local_file.read()
self.local_file.close()
return self._alignment_file_contents
def set_alignment_file_contents(self, value):
# cache the value
self._alignment_file_contents = value
# and save to permanent storage
if not self.local_file:
model_field = self.local_file.field # note this works even though self.local_file is falsy
upload_filename = model_field.generate_filename(self, "%d.%s" % (self.id, self.format))
self.local_file = FieldFile(instance=self, field=model_field, name=upload_filename)
self.save()
# Django FieldFile open() method fails if the file doesn't exist in filesystem (even using mode 'w')
upload_file = open(self.local_file.path, 'w')
upload_file.write(value)
upload_file.close()
alignment_file_contents = property(get_alignment_file_contents, set_alignment_file_contents)
def extract_related_objects(self):
self.extract_rows_and_columns()
self.extract_conservation_scores()
self.newick_tree = self.get_newick_tree()
self.save()
tree = Tree(self.newick_tree)
self.extract_clades(tree, tree.root)
def extract_rows_and_columns(self):
row_num = 1
for biopy_seqrec in self.biopy_alignment:
new_row = Row(
alignment=self,
sequence=str(biopy_seqrec.seq).upper(),
name=biopy_seqrec.id, num=row_num)
new_row.save()
row_num += 1
for col_num in range(self.length):
new_col = Column(
alignment=self,
sequence=self.biopy_alignment.get_column(col_num).upper(),
num=col_num+1)
new_col.save()
def get_newick_tree(self):
temp = None
# quicktree expects a stockholm format input file
if self.local_file.name and self.format == "stockholm":
fname = self.local_file.path
else:
temp = tempfile.NamedTemporaryFile()
print "writing stockholm format file..."
AlignIO.write([self.biopy_alignment], temp, "stockholm")
temp.flush()
fname = temp.name
print "opening quicktree on stockholm format file %s" % fname
quicktree_out = os.popen('quicktree %s' % fname) # subprocess.Popen hangs the Django dev server
# there should be some elementary error checking here...
newick_tree = quicktree_out.read()
print "quicktree finished"
if temp:
# 'temp' is unlinked immediately after creation--so be sure to close it only after we're certain
# that quicktree succesfully opened it (i.e, only after read(), not just after popen())
temp.close()
return newick_tree
def extract_clades(self, tree, n):
print "extracting clade %d" % n
node = tree.node(n)
c = Clade(alignment=self, num=node.id, local_branch_length=node.data.branchlength)
if node.prev is not None:
c.parent = self.clades.get(num=node.prev)
c.cumulative_branch_length = c.parent.cumulative_branch_length + c.local_branch_length
if not node.succ:
# no successors -> leaf node
c.row = self.rows.get(name=node.data.taxon)
c.save()
for child in node.succ:
self.extract_clades(tree, child)
def get_threshold_grouping(self, threshold):
"""
Given a threshold branch length, cut the tree at that branch length.
Returns a ThresholdGrouping object corresponding to the cut, with the property that any time
a given threshold corresponds to a previously-seen cut, the same ThresholdGroupSet is returned.
"""
root_clades = self.clades.filter(cumulative_branch_length__gt=threshold,
parent__cumulative_branch_length__lte=threshold)
root_clade_ids = list(root_clades.values_list('id', flat=True)) # i.e., prevent requerying
if not root_clade_ids:
# no root clades (threshold is negative or greater than cumulative_branch_length for all clades)
# so we should consider all clades to be in the same group, defined by the root clade
root_clade_ids = [self.clades.all()[0].get_root().id]
root_clades = Clade.objects.filter(id=root_clade_ids[0])
# This query finds all the ThresholdGroupings that contain *all* clades in the 'root_clades' queryset.
# Note that it's an obvious proof that, if a set of C of clades represents some cut, then can be no
# other cut that contains all members of C.
# So (modulo consistency issues) there should be at most one ThresholdGrouping returned, and it should
# contain exactly the same set of root clades as in 'root_clades'
existing_groupings = self.threshold_groupings.filter(
root_clades__id__in=root_clade_ids
).annotate(
num_clades=Count('root_clades')
).filter(
num_clades=len(root_clade_ids)
).order_by(
'id'
)
# if database consistency issue mean there are two existing_groupings, try to always return the first
if len(existing_groupings) > 0:
return existing_groupings[0]
new_grouping = ThresholdGrouping(alignment=self, threshold=threshold)
new_grouping.save()
new_grouping.root_clades = root_clades
return new_grouping
def extract_conservation_scores(self):
# okay, this could surely be modified to hit only the 'columns' table instead of rows and columns
seq_list = self.rows.values_list('sequence', flat=True)
seq_weights = calculate_sequence_weights(seq_list)
cols = self.columns.order_by('num')
scores = window_score([conservation(col.sequence, seq_weights) for col in cols])
for col, score in izip(cols, scores):
cons = Conservation(column=col, score=score, pending=False)
cons.save()
class Row(models.Model):
alignment = models.ForeignKey(Alignment, related_name = 'rows', db_index=True)
num = models.IntegerField(editable=False, db_index=True)
name = models.CharField(max_length=100)
sequence = models.TextField()
comment = models.TextField(blank=True)
def __unicode__(self):
return self.name
class Meta:
unique_together = ('alignment', 'num')
class Column(models.Model):
alignment = models.ForeignKey(Alignment, related_name = 'columns', db_index=True)
num = models.IntegerField(editable=False, db_index=True)
sequence = models.TextField()
comment = models.TextField(blank=True)
class Meta:
unique_together = ('alignment', 'num')
class Cell(models.Model):
row = models.ForeignKey(Row, db_index=True)
column = models.ForeignKey(Column, db_index=True)
comment = models.TextField(blank=True)
class Meta:
unique_together = ('column', 'row')
class Clade(models.Model):
alignment = models.ForeignKey(Alignment, related_name='clades', db_index=True)
num = models.IntegerField(editable=False)
cumulative_branch_length = models.FloatField(default=0.0, editable=False)
local_branch_length = models.FloatField(editable=False)
row = models.OneToOneField(Row, null=True) # leaf nodes only
parent = models.ForeignKey('self', null=True, related_name='children')
# use django-mptt to manage tree structure of Clades. Possibly this should be in __init__.py?
try:
mptt.register(Clade, order_insertion_by=["num"])
except mptt.AlreadyRegistered:
pass
class ThresholdGrouping(models.Model):
alignment = models.ForeignKey(Alignment, related_name='threshold_groupings')
root_clades = models.ManyToManyField(Clade, related_name='threshold_groupings')
threshold = models.FloatField()
class RowGroup(models.Model):
# issue. Would like to have a 'Group' table against which to score relatedness.
# ThresholdGroupings are currently
# essentially the same as Clades. But other groupings will be specified differently...
# solution.
# 1. create the 'RowGroup' model
# 2. for each type of row group (threshold group, column-sorted group), create a model with a one-to-one
# relationship with RowGroup. This model contains the uniquely specifying info for that group (i.e.,
# the root clade for a threshold_group; a M2M field on Rows for manual-sort groups, etc.)
# 3. Possibly include an M2M relationship with Rows in RowGroup model -- but notice this is a sort
# of denormalization (the augmenting model should contain the methods necessary to calculate the rows)
# 4. Update Alignment.get_threshold_grouping, ThresholdGrouping (which should now point to RowGroups
# instead of Clades), and the Piston resources that return groupings
pass
class UniqueColumnScore(models.Model):
column = models.OneToOneField(Column, db_index=True)
score = models.FloatField(null=True) # null -> not calculated for this col (too many gaps)
pending = models.BooleanField() # whether the calculation's value is available yet
class Meta:
abstract = True
class Conservation(UniqueColumnScore):
pass
class RelatednessScore(models.Model):
related_column = models.ForeignKey(Column)
value = models.FloatField(null=True)
pending = models.BooleanField()
def save(self, *args, **kwargs):
if (column1.alignment != column2.alignment):
return # FIXME what exception should this raise?
super(Relatedness, self).save(*args, **kwargs)
class Meta:
unique_together = ('subject', 'related_column')
abstract = True
class ColumnRelatedness(RelatednessScore):
subject = models.ForeignKey(Column, db_index=True, related_name='relatedness_scores')
class CladeRelatedness(RelatednessScore):
subject = models.ForeignKey(Clade, db_index=True, related_name='relatedness_scores')
class RowGroupRelatedness(RelatednessScore):
subject = models.ForeignKey(RowGroup, db_index=True, related_name='relatedness_scores')
### forms ###
class BaseAlignmentForm(ModelForm):
class Meta:
model = Alignment
exclude = ['newick_tree']
class CreateAlignmentForm(BaseAlignmentForm):
def clean(self):
"""
# 1. validates that, if there is no local_file, that source_url downloads okay (placing contents in
cleaned_data['remote_url_contents'])
# 2. validates that the resulting file, whether a local file or the contents of remote url, parses
correctly
# 3. Places resulting Biopython Alignment object in cleaned_data['biopy_alignment']
"""
super(CreateAlignmentForm, self).clean()
cleaned_data = self.cleaned_data
source_url = cleaned_data.get('source_url')
local_file = cleaned_data.get('local_file')
format = cleaned_data.get('format')
# this method (clean()) is called whether or not the format field validates.
# Let's not waste resources downloading and parsing files if the format field isn't valid.
if not format:
# remember always to return cleaned_data; field validation will have already supplied the required
# error messages here
return cleaned_data
if local_file:
biopy_alignment_file = local_file
elif source_url:
try:
remote_url_contents = urlopen(source_url).read()
except IOError:
self._errors['source_url'] = ErrorList(['Could not load url: %s' % source_url])
del cleaned_data['source_url']
return cleaned_data
cleaned_data['remote_url_contents'] = remote_url_contents
biopy_alignment_file = StringIO(remote_url_contents)
else:
raise ValidationError("You need to either supply a file or a valid URL.")
try:
biopy_alignment = AlignIO.read(biopy_alignment_file, format)
except ValueError:
raise ValidationError(
"The alignment could not be parsed. Perhaps you chose the wrong format?")
cleaned_data['biopy_alignment'] = biopy_alignment
return cleaned_data
class EditAlignmentForm(BaseAlignmentForm):
class Meta(BaseAlignmentForm.Meta):
exclude = ['source_url', 'local_file', 'format', 'newick_tree']