-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path01-clustering_genes_and_diseases.py
79 lines (66 loc) · 2.93 KB
/
01-clustering_genes_and_diseases.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/usr/bin/env python
'''
The purpose of this script is to investigate direct overlaps in genetic risk across diseases. This is done by:
(1) Hierarchical clustering of genes v diseases
(2) Generating a counts matrix with number of directly overlapping genes between disorders
INPUTS:
- .csv file with the list of all genes and all diseases, where each element is the risk score of that gene in relation to that disease
- .csv with one-hot-encoded genes per disease
PROCESS:
- Performs hierarchical clustering on both axes, plots clustermap
- Calculate counts of total number of overlapping genes across diseases
OUTPUTS:
- Clustermap of genes with risk scores in relation to diseases
- Clustermap of counts of overlapping genes between diseases
- .csv file with adjacency matrix, where total counts of genes shared across disorders are summarised
'''
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
###----------------------Set some plotting parameters
# Enlarge all seaborn/matplotlib text slightly relative to the default theme.
sns.set(font_scale=1.1)

# Lookup table: disease column name -> colour of its label strip on the
# clustermaps. Diseases that share a colour are drawn as one category.
# NOTE(review): the three colours appear to mark disease categories; the
# category names are not stated in this file - confirm with the author.
PURPLE = 'purple'
TEAL = 'teal'
ORANGE = 'orange'

lut = {
    'AD': PURPLE,
    'ALS': PURPLE,
    'AnorexiaNervosa': TEAL,
    'BipolarDisorder': TEAL,
    'BrainAneurysm': ORANGE,
    'EssentialTremor': PURPLE,
    'FrontotemporalDementia': PURPLE,
    'IntracranialHemorrhage': ORANGE,
    'LBD': PURPLE,
    'MajorDepressiveDisorder': TEAL,
    'MigraineDisorder': ORANGE,
    'MigraineWithAura': ORANGE,
    'MS': ORANGE,
    'NarcolepsyCataplexy': ORANGE,
    'Narcolepsy': ORANGE,
    'NeuroticDisorder': TEAL,
    'OCD': TEAL,
    'PartialEpilepsy': ORANGE,
    'PD': PURPLE,
    'ProgressiveSupranuclearPalsy': PURPLE,
    'RestlessLeg': ORANGE,
    'Schizophrenia': TEAL,
    'TouretteSyndrome': TEAL,
    'UnipolarDepression': TEAL,
}
#Genes v diseases with risk scores
# Load the gene x disease risk-score table, keep the first row for each gene
# symbol, and use the symbol as the row index so only disease columns remain.
df = pd.read_csv('/Users/melis/Documents/Gene-targets/Disease_gene_associations_OpenTargets/risk-score-all-diseases.csv')
df = df.drop_duplicates(subset=['symbol'])
df = df.set_index('symbol')

# One colour per disease column, looked up in lut. This has to run after df is
# loaded and indexed: it reads df.columns, and 'symbol' must no longer be one
# of them.
row_colors = df.columns.unique().map(lut)

# Cluster both axes; per-gene tick labels are switched off (yticklabels=False).
g = sns.clustermap(df, col_colors=row_colors, yticklabels=False)

# Reposition the colour bar. The grid object must exist before its cbar_pos
# can be read; only the x origin is reused, the rest of the rectangle
# ([left, bottom, width, height]) is set explicitly.
x0, _y0, _w, _h = g.cbar_pos
g.ax_cbar.set_position([x0, 0.02, g.ax_row_dendrogram.get_position().width/6, 0.1])
g.ax_cbar.set_title('genetic risk')

# Save the figure as both PDF and 300-dpi PNG, then display it.
plt.savefig('plots/01-clustering_genes_and_diseases/clustermap_genetic_association.pdf')
plt.savefig('plots/01-clustering_genes_and_diseases/clustermap_genetic_association.png', dpi=300)
plt.show()
#Total genes overlapping across diseases
# Load the one-hot gene x disease table, keep the first row per gene and index
# the rows by gene so that only disease columns remain.
df = (
    pd.read_csv('/Users/melis/Documents/Gene-targets/Disease_gene_associations_OpenTargets/hot-encoded-diseases.csv')
    .drop_duplicates(subset=['genes'])
    .set_index('genes')
)

# disease x disease matrix: entry (i, j) is the dot product of the columns for
# diseases i and j, i.e. the number of shared genes when the entries are 0/1.
overlap_counts = df.T.dot(df)

# NOTE(review): row_colors was derived from the risk-score table's columns and
# is reused here - this assumes both CSVs list the same diseases in the same
# column order; confirm.
sns.clustermap(
    overlap_counts,
    vmax=400,
    row_colors=row_colors,
    col_colors=row_colors,
)
plt.savefig('plots/01-clustering_genes_and_diseases/clustermap_disease_overlaps.png', dpi=300)
plt.show()

# Persist the same matrix that was plotted.
overlap_counts.to_csv('processed_data/01-clustering_genes_and_diseases/counts_overlapping_genes.csv')