-
Notifications
You must be signed in to change notification settings - Fork 1
/
PYlomap.py
430 lines (336 loc) · 17.6 KB
/
PYlomap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
# Heatmap Generation from MicroOrganism Data
# By William Pearson and Hannah Eccleston
#Import Packages
from openpyxl import load_workbook
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import hashlib
import graphviz
###Define Class###
class SampleInformation:
    """Locate one sample's data in an Excel workbook and hold what is parsed from it.

    Column attributes are worksheet column letters and row attributes are row
    numbers held as strings, because the range helpers concatenate them into
    A1-style references such as ``"A3:F203"``.
    """

    def __init__(self, workBookName, workSheetName, sampleName):
        """Record the workbook file, the worksheet and the sample name.

        The cell ranges are filled in later by ``SetUpMicroOrganismDatSet``.
        """
        self.workBookName = workBookName
        self.workSheetName = workSheetName
        self.sampleName = sampleName
        # Optional display name; MakeHeatMap prefers it over sampleName.
        self.nameOveride = None
        self.iDColumnStart = None
        self.iDColumnEnd = None
        self.relativeAbundanceColumn = None
        self.dataRowStart = None
        self.dataRowEnd = None
        # List of MicroOrganism objects, set by DetermineMicroOrganisms.
        self.MicroOrganisms = None
        self.Parent = None
        # List of Branch objects, filled by GenerateTree.
        self.microOrganismTree = []

    def SetUpMicroOrganismDatSet(self, iDColumnStart, dataRowStart, iDColumnEnd, dataRowEnd, relativeAbundanceColumn):
        """Store the column letters and row numbers (as strings) that bound the data."""
        self.iDColumnStart = iDColumnStart
        self.dataRowStart = dataRowStart
        self.iDColumnEnd = iDColumnEnd
        self.dataRowEnd = dataRowEnd
        self.relativeAbundanceColumn = relativeAbundanceColumn

    def AddMicroOrganismList(self, microOrganismList):
        """Replace the stored micro-organism list with ``microOrganismList``."""
        self.MicroOrganisms = microOrganismList

    def FullDataSetInformation(self):
        """Return a one-line summary: sample name, name range and abundance range."""
        # The original read ``self.dataStart``, which is never defined; the
        # abundance range starts on ``dataRowStart`` like the name range does.
        return ("Name - " + self.sampleName +
                " | " + self.BacteriaNameRange() +
                " | " + self.RelativeAbundanceRange())

    def RelativeAbundanceRange(self):
        """Return the A1-style range of the relative-abundance column."""
        return (self.relativeAbundanceColumn + self.dataRowStart + ":" +
                self.relativeAbundanceColumn + self.dataRowEnd)

    def BacteriaNameRange(self):
        """Return the A1-style range covering the taxonomy name columns."""
        return (self.iDColumnStart + self.dataRowStart + ":" +
                self.iDColumnEnd + self.dataRowEnd)

    def ReturnRelativeAbundanceOfMicroOraganism(self, microOrganismIDToCheck_Hash):
        """Return the relative abundance of the first organism with this hash ID.

        Returns 0 when no stored organism has the given hash ID.
        """
        for microOrganism in self.MicroOrganisms:
            if microOrganism.hashID == microOrganismIDToCheck_Hash:
                return microOrganism.relativeAbundance
        return 0

    def CreateMicroOrganismList(self, BacteriaNameList, RelativeAbundanceList):
        """Build MicroOrganism objects from two parallel openpyxl cell ranges.

        ``BacteriaNameList`` rows supply six cells (domain, phylum,
        classification, order, family, genus); ``RelativeAbundanceList`` rows
        supply the abundance in their first cell. One organism is created per
        row of ``RelativeAbundanceList`` and given its hash ID.
        """
        microOrganismList = []
        for rowIndex, abundanceRow in enumerate(RelativeAbundanceList):
            nameCells = BacteriaNameList[rowIndex]
            domain, phylum, classification, order, family, genus = (
                nameCells[level].value for level in range(6))
            newMicroOrganism = MicroOrganism(domain,
                                             phylum,
                                             classification,
                                             order,
                                             family,
                                             genus,
                                             abundanceRow[0].value)
            # The hash ID is what the heat map uses to match organisms
            # between samples.
            newMicroOrganism.GenerateSelfHashID()
            microOrganismList.append(newMicroOrganism)
        return microOrganismList

    def DetermineMicroOrganisms(self):
        """Read the configured ranges from the workbook and store the organisms."""
        # data_only=True makes openpyxl return cached values rather than formulas.
        excelWorkBook = load_workbook(filename=self.workBookName, data_only=True)
        excelSheet = excelWorkBook[self.workSheetName]
        openpyxlList_BacteriaName = excelSheet[self.BacteriaNameRange()]
        openpyxlList_RelativeAbundance = excelSheet[self.RelativeAbundanceRange()]
        microOrganismList = self.CreateMicroOrganismList(openpyxlList_BacteriaName,
                                                         openpyxlList_RelativeAbundance)
        self.AddMicroOrganismList(microOrganismList)

    def GenerateTree(self, workBookName, minColumn, maxColumn, minRow, maxRow, functionDictionary, taxonDictionary):
        """Populate ``microOrganismTree`` from the rows belonging to this sample.

        Every row of the workbook's active sheet is scanned. In a row whose
        first cell equals ``sampleName``, cell 1 is the function, cell 2 the
        taxon ID and cell 4 the taxon relative abundance. The first row seen
        for a taxon creates a Branch; later rows for the same taxon only add
        another function/description pair to it.

        ``minColumn``, ``maxColumn``, ``minRow`` and ``maxRow`` are accepted
        for interface compatibility but are not used to restrict the scan.
        """
        # Safety cap on the number of worksheet rows scanned in one call.
        breakIndex = 10000000

        # Index the branches already in the tree by taxon hash so each row is
        # matched with a dictionary lookup instead of a scan of the whole tree.
        branchesByHash = {}
        for branch in self.microOrganismTree:
            branchesByHash.setdefault(branch.taxonIDHashID, []).append(branch)

        excelWorkBook = load_workbook(filename=workBookName, read_only=True)
        try:
            excelSheet = excelWorkBook.active
            for currentIndex, row in enumerate(excelSheet.iter_rows(), start=1):
                if row and row[0].value == self.sampleName:
                    self._AddRowToTree(row, branchesByHash, functionDictionary, taxonDictionary)
                if currentIndex == breakIndex:
                    break
        finally:
            # Read-only workbooks keep the file open until closed explicitly.
            excelWorkBook.close()

    def _AddRowToTree(self, row, branchesByHash, functionDictionary, taxonDictionary):
        """Fold one matching worksheet row into the tree (helper for GenerateTree)."""
        newBranch = Branch(row[2].value, row[4].value)
        newBranch.GenerateTaxonIDHashID()

        existingBranches = branchesByHash.get(newBranch.taxonIDHashID)
        if existingBranches:
            # Taxon already present: only attach the new function to it.
            for branch in existingBranches:
                branch.AddNewFunctionDescriptionPair(
                    self._BuildFunctionDescriptionPair(row[1].value, functionDictionary))
            return

        # First row for this taxon: resolve its organism, attach the function
        # and register the branch in both the tree and the lookup index.
        newBranch.GenerateMicroOrgansimFromTaxonID(taxonDictionary)
        newBranch.AddNewFunctionDescriptionPair(
            self._BuildFunctionDescriptionPair(row[1].value, functionDictionary))
        self.AddBranchToTree(newBranch)
        branchesByHash[newBranch.taxonIDHashID] = [newBranch]

    @staticmethod
    def _BuildFunctionDescriptionPair(function, functionDictionary):
        """Create a hashed FunctionDescriptionPair with its description looked up."""
        newFunctionDescriptionPair = FunctionDescriptionPair(function)
        newFunctionDescriptionPair.GenerateFunctionHashID()
        newFunctionDescriptionPair.GenerateDescriptionFromDictionary(functionDictionary)
        return newFunctionDescriptionPair

    def AddBranchToTree(self, newBranch):
        """Append ``newBranch`` to ``microOrganismTree``."""
        self.microOrganismTree.append(newBranch)
class MicroOrganism:
    """A six-rank taxonomy record with a relative abundance and an MD5 hash ID."""

    def __init__(self, domain, phylum, classification, order, family, genus, relativeAbundance):
        """Store the six taxonomy ranks and the abundance; the hash ID starts unset."""
        self.domain, self.phylum, self.classification = domain, phylum, classification
        self.order, self.family, self.genus = order, family, genus
        self.relativeAbundance = relativeAbundance
        # Filled in by GenerateSelfHashID.
        self.hashID = None

    def FullBacteriaID(self):
        """Return the six ranks, domain first, joined with ';'."""
        ranks = (self.domain, self.phylum, self.classification,
                 self.order, self.family, self.genus)
        return ";".join(ranks)

    def HighestNameStructure(self):
        """Return the most specific rank that is neither "__" nor "uncultured".

        Ranks are tried from genus up to phylum; when all five are
        placeholders the domain is returned.
        """
        candidates = (
            (self.genus, "g__uncultured"),
            (self.family, "f__uncultured"),
            (self.order, "o__uncultured"),
            (self.classification, "c__uncultured"),
            (self.phylum, "p__uncultured"),
        )
        for rankName, unculturedMarker in candidates:
            if rankName != "__" and rankName != unculturedMarker:
                return rankName
        return self.domain

    def RelativeAbundance(self):
        """Return the stored relative abundance."""
        return self.relativeAbundance

    def GenerateSelfHashID(self):
        """Set ``hashID`` to the MD5 hex digest of ``FullBacteriaID()``."""
        fullID = self.FullBacteriaID()
        self.hashID = hashlib.md5(fullID.encode()).hexdigest()
class Branch:
    """One taxon in a sample's tree: its ID, abundance, organism and functions."""

    def __init__(self, taxonID, taxonRelativeAbundance):
        """Store the taxon ID and abundance; hash, organism and pairs start empty."""
        self.taxonID = taxonID
        self.taxonRelativeAbundance = taxonRelativeAbundance
        # MD5 hex digest of taxonID, set by GenerateTaxonIDHashID.
        self.taxonIDHashID = None
        # MicroOrganism resolved by GenerateMicroOrgansimFromTaxonID.
        self.microOrganism = None
        self.functionDescriptionPairs = []

    def GenerateTaxonIDHashID(self):
        """Set ``taxonIDHashID`` to the MD5 hex digest of ``taxonID``."""
        self.taxonIDHashID = hashlib.md5(self.taxonID.encode()).hexdigest()

    @staticmethod
    def _PadTaxonLevels(taxonLevels, levelCount=6):
        """Return the first ``levelCount`` levels, padded with "__" when short."""
        padded = list(taxonLevels[:levelCount])
        padded.extend(["__"] * (levelCount - len(padded)))
        return padded

    def GenerateMicroOrgansimFromTaxonID(self, chosenTaxonDictionary):
        """Resolve this branch's taxon ID to a MicroOrganism via the dictionary.

        Each row of ``chosenTaxonDictionary.DictionaryData`` holds the taxon
        ID in cell 0 and a "; "-separated lineage in cell 1. A row matches
        when the MD5 of its ID equals ``taxonIDHashID``, so
        ``GenerateTaxonIDHashID`` must have been called first. Rows whose ID
        or lineage cell is not text (for example blank cells) are skipped
        instead of raising. When no row matches, ``microOrganism`` is left
        unchanged.
        """
        for row in chosenTaxonDictionary.DictionaryData:
            rowTaxonID = row[0].value
            if not isinstance(rowTaxonID, str):
                # Blank or non-text ID cell: it cannot be hashed or matched.
                continue
            if self.taxonIDHashID != hashlib.md5(rowTaxonID.encode()).hexdigest():
                continue

            rowLineage = row[1].value
            if not isinstance(rowLineage, str):
                # Matching ID but no usable lineage text to split.
                continue

            # Short lineages are padded so the organism always has six ranks.
            taxonList = self._PadTaxonLevels(rowLineage.split("; "))
            self.microOrganism = MicroOrganism(taxonList[0],
                                               taxonList[1],
                                               taxonList[2],
                                               taxonList[3],
                                               taxonList[4],
                                               taxonList[5],
                                               0)
            self.microOrganism.GenerateSelfHashID()
            # No early exit: if the dictionary repeats an ID, the last
            # matching row wins, as before.

    def AddNewFunctionDescriptionPair(self, newPair):
        """Append ``newPair`` to ``functionDescriptionPairs``."""
        self.functionDescriptionPairs.append(newPair)
class FunctionDescriptionPair:
    """A function identifier together with its hash and looked-up description."""

    def __init__(self, function):
        """Store the function identifier; hash and description start unset."""
        self.function = function
        self.functionHashID = None
        self.description = None

    def GenerateFunctionHashID(self):
        """Set ``functionHashID`` to the MD5 hex digest of ``function``."""
        encodedFunction = self.function.encode()
        self.functionHashID = hashlib.md5(encodedFunction).hexdigest()

    def GenerateDescriptionFromDictionary(self, chosenFunctionDictionary):
        """Copy the description of every dictionary row whose ID hash matches.

        Each row of ``chosenFunctionDictionary.DictionaryData`` holds the
        function ID in cell 0 and its description in cell 1. All rows are
        visited, so the last matching row determines ``description``.
        """
        for entry in chosenFunctionDictionary.DictionaryData:
            entryHashID = hashlib.md5(entry[0].value.encode()).hexdigest()
            if entryHashID == self.functionHashID:
                self.description = entry[1].value
class LookUpDictionary:
    """A rectangular block of worksheet cells used as a lookup table.

    The cells are read from the workbook as soon as the object is created.
    """

    def __init__(self, workBookName, workSheetName, columnStart, dataRowStart, columnEnd, dataRowEnd):
        """Record the workbook, sheet and cell bounds, then load the cells."""
        self.workBookName = workBookName
        self.workSheetName = workSheetName
        self.columnStart = columnStart
        self.dataRowStart = dataRowStart
        self.columnEnd = columnEnd
        self.dataRowEnd = dataRowEnd
        # Replaced with the loaded cell range by GenerateDictionary below.
        self.DictionaryData = None
        self.GenerateDictionary()

    def DataRange(self):
        """Return the A1-style range spanned by the configured bounds."""
        topLeft = "".join((self.columnStart, self.dataRowStart))
        bottomRight = "".join((self.columnEnd, self.dataRowEnd))
        return ":".join((topLeft, bottomRight))

    def GenerateDictionary(self):
        """Open the workbook and store the cells of ``DataRange()``."""
        # data_only=True makes openpyxl return cached values rather than formulas.
        workBook = load_workbook(filename=self.workBookName, data_only=True)
        workSheet = workBook[self.workSheetName]
        self.DictionaryData = workSheet[self.DataRange()]
###Visualisation###
def MakeHeatMap(dataSets, LinewidthOveride = 0, PercentToIgnore = 0):
    """Plot a heat map of relative abundance per micro-organism and data set.

    Parameters:
        dataSets: sequence of objects exposing ``sampleName``,
            ``nameOveride``, ``MicroOrganisms`` and
            ``ReturnRelativeAbundanceOfMicroOraganism``.
        LinewidthOveride: width of the lines between cells, passed on to
            ``HeatMapPlot``.
        PercentToIgnore: threshold in percent. An organism is kept only if at
            least one data set exceeds it; kept cells at or below it are
            shown as 0.

    Stored abundances are compared against ``PercentToIgnore / 100`` and
    multiplied by 100 for display, so they are presumably fractions of 1.
    """
    # Column labels: the override name when one is set, else the sample name.
    dataSetNameLists = [
        dataSet.sampleName if dataSet.nameOveride is None else dataSet.nameOveride
        for dataSet in dataSets
    ]

    # Collect every distinct organism in first-seen order. The set gives
    # constant-time "seen before?" checks; the list keeps the row order.
    allMicrobacteriaPresent = []
    microOrganismNameList = []
    seenHashIDs = set()
    for dataSet in dataSets:
        for microOrganism in dataSet.MicroOrganisms:
            hashIDToCheck = microOrganism.hashID
            if hashIDToCheck not in seenHashIDs:
                seenHashIDs.add(hashIDToCheck)
                allMicrobacteriaPresent.append(hashIDToCheck)
                microOrganismNameList.append(microOrganism.HighestNameStructure())

    # One abundance list per data set, aligned with allMicrobacteriaPresent.
    # A data set that lacks an organism reports 0 for it.
    RALists = [
        [dataSet.ReturnRelativeAbundanceOfMicroOraganism(mO) for mO in allMicrobacteriaPresent]
        for dataSet in dataSets
    ]

    # Drop organisms that never exceed the threshold and convert the rest
    # to percentages.
    threshold = PercentToIgnore / 100
    cutdownMicroOrganismNameList = []
    cutdownRALists = [[] for _ in RALists]
    for column, microOrganismName in enumerate(microOrganismNameList):
        if not any(RAList[column] > threshold for RAList in RALists):
            continue
        cutdownMicroOrganismNameList.append(microOrganismName)
        for RAList, cutdownRAList in zip(RALists, cutdownRALists):
            relativeAbundance = RAList[column]
            cutdownRAList.append(relativeAbundance * 100 if relativeAbundance > threshold else 0)

    # Rows are data sets here; transpose so organisms run down the y axis.
    pandaFrameWork = pd.DataFrame(cutdownRALists,
                                  columns=cutdownMicroOrganismNameList,
                                  index=dataSetNameLists)
    pandaFrameWork_transpose = pandaFrameWork.transpose()

    HeatMapPlot(pandaFrameWork_transpose, LinewidthOveride)
def HeatMapPlot(data, linewidthOveride):
    """Draw ``data`` as an annotated seaborn heat map and show the figure.

    ``linewidthOveride`` is passed to seaborn as the ``linewidths`` argument.
    """
    sns.heatmap(
        data,
        cmap="YlGnBu",
        annot=True,
        yticklabels=True,
        linewidths=linewidthOveride,
    )
    # Tighten the layout before showing the figure.
    plt.tight_layout()
    plt.show()
def DrawTreeDiagram(branch):
    """Draw one branch as a graph linking each function to the branch's organism.

    Every function/description pair on ``branch`` contributes one edge from
    the function name to the organism's full bacteria ID; the finished graph
    is then opened with graphviz's ``view()``.
    """
    diagram = graphviz.Graph('G', engine='sfdp')
    for pair in branch.functionDescriptionPairs:
        organismLabel = branch.microOrganism.FullBacteriaID()
        diagram.edge(pair.function, organismLabel)
    diagram.view()
###INITIALISE DATASETS###
# All three samples are read from the same workbook and worksheet; the first
# two also get a display name for the heat map.
dataSet1 = SampleInformation(workBookName="HannahExcelData.xlsx", workSheetName="Sheet1", sampleName="AA1")
dataSet1.nameOveride = "Hannah1"
dataSet2 = SampleInformation(workBookName="HannahExcelData.xlsx", workSheetName="Sheet1", sampleName="BB2")
dataSet2.nameOveride = "Hannah2"
dataSet3 = SampleInformation(workBookName="HannahExcelData.xlsx", workSheetName="Sheet1", sampleName="CC01B")

###SET UP DATA SET
# Taxonomy names sit in columns A-F and relative abundance in column G,
# rows 3-203, for every sample.
dataSet1.SetUpMicroOrganismDatSet(iDColumnStart="A", dataRowStart="3",
                                  iDColumnEnd="F", dataRowEnd="203",
                                  relativeAbundanceColumn="G")
dataSet2.SetUpMicroOrganismDatSet(iDColumnStart="A", dataRowStart="3",
                                  iDColumnEnd="F", dataRowEnd="203",
                                  relativeAbundanceColumn="G")
dataSet3.SetUpMicroOrganismDatSet(iDColumnStart="A", dataRowStart="3",
                                  iDColumnEnd="F", dataRowEnd="203",
                                  relativeAbundanceColumn="G")

###DETERMINE MICRO-ORGANISMS###
dataSet1.DetermineMicroOrganisms()
dataSet2.DetermineMicroOrganisms()
dataSet3.DetermineMicroOrganisms()

###Plot HeatMap of MICRO-ORGANISMS###
# Only the first two samples are plotted, with a 1% threshold.
MakeHeatMap([dataSet1, dataSet2], LinewidthOveride = 0, PercentToIgnore = 1)

###Set Up Dictionaries
FunctionDictionary = LookUpDictionary(workBookName="Function Dictionary.xlsx",
                                      workSheetName="Sheet 1 - path_abun_unstrat_des",
                                      columnStart="A", dataRowStart="3",
                                      columnEnd="B", dataRowEnd="404")
TaxonDictionary = LookUpDictionary(workBookName="Taxon dictionary.xlsx",
                                   workSheetName="Sheet 1 - metadata (9)",
                                   columnStart="A", dataRowStart="4",
                                   columnEnd="C", dataRowEnd="2672")

###SET UP TAXON and DESCRIPTION DATA
dataSet3.GenerateTree(workBookName="Parent.xlsx",
                      minColumn="A", maxColumn="H",
                      minRow="1", maxRow="935127",
                      functionDictionary=FunctionDictionary,
                      taxonDictionary=TaxonDictionary)

###Visualise Tree Diagram
# Draw the first branch of the third sample's tree.
DrawTreeDiagram(dataSet3.microOrganismTree[0])