-
Notifications
You must be signed in to change notification settings - Fork 0
/
2.Extracting_the_obesity_signature.py
152 lines (96 loc) · 4.69 KB
/
2.Extracting_the_obesity_signature.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
#get_ipython().magic('matplotlib inline')
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ks_2samp
import copy
import sklearn
geno = pd.read_pickle("./output/replication/our/batches1-4_merged/batch1234_geno.p")
pheno = pd.read_pickle("./output/replication/our/batches1-4_merged/batch1234_pheno.p")
# ### Compute first principal component
# We compute the first principal component of the gene-expression dataframe. We also load Gene Symbol annotations.
# In[11]:
v = (PCA().fit(geno)).components_[0]
v_df = pd.DataFrame(columns=["coef","abs_coef"],index = geno.columns)
v_df["coef"] = v
v_df["abs_coef"] = np.abs(v)
v_df.sort_values(by="abs_coef",ascending=False,inplace=True)
v_df.drop("abs_coef",axis=1,inplace=True)
# In[12]:
hugo_df = pd.read_table("./data/HUGO_official_list_20160613.txt")
entrez_to_genesymb = dict(hugo_df[["Entrez Gene ID","Approved Symbol"]].astype("unicode").applymap(lambda x:x.split(".")[0]).values)
# In[13]:
v_df["GeneSymb"] = v_df.index.map(lambda x:entrez_to_genesymb[x] if x in entrez_to_genesymb.keys() else "???")
v_df.index.name = "Entrez"
v_df.set_index("GeneSymb",append=True,inplace=True)
# So now we have a dataframe with genes in the index and the coefficients of the first principal component of batch1-4 as the only column. We have actually sorted the genes by absolute value of coefficient:
# In[14]:
v_df.head()
# ### Set a statistical significance threshold
# The first principal component of the gene-expression data can be thought of as a direction in gene-space. To set a threshold for statistical significance, we compare the coefficients we have with those of a random direction in this same space.
#
# A random direction in a space of dimension $N$ has standard Gaussian coefficients, see [Muller,1959](http://dl.acm.org/citation.cfm?id=377946). If the norm of our signature vector equals one, then the coefficients of a random direction of the same norm must have variance $1/N$, and thus the distribution of the coefficients of a random direction are Gaussians with standard deviation $\sigma = 1/\sqrt{N}$.
#
# We then decide to set a threshold of $5\sigma$, that is, **we keep only those coefficients that are boeyond 5 sigma's** of what we would expect at random.
# In[15]:
N = geno.shape[1]
sigma = 1/np.sqrt(N)
# There are only 65 genes above our $5\sigma$ threshold. We discard the rest:
# In[16]:
idx = np.abs(v_df.coef)>5*sigma
print("genes beyond threshold:",idx.sum())
# In[17]:
signature = v_df.loc[idx]
# ### Plot signature
# Here we reproduce Figure 1d in the manuscript
# In[18]:
v_df["abscoef"] = v_df["coef"].apply(np.abs)
# In[19]:
# do an ensemble of fake signatures
tmp = np.array([sorted(np.abs(np.random.normal(size=v_df.shape[0])/np.sqrt(v_df.shape[0]))) for x in range(100)])
m=tmp.mean(axis=0)
s=tmp.std(axis=0)
# In[20]:
labelsize=16 # fontsize of x,y labels
title_size=18 # fontsize of the title
ticks_size=14 # fontsize of numbers in x,y ticks
latex_size=18 # fontsize of latex-rendered pval, corr_coef annotations
# In[21]:
plt.figure(figsize=(6.5,5))
# internal parameters
t = range(1,N+1)
Ng = 2000
mycolor = "#03589E"
M=20
K2=5
# MAIN PLOT
plt.plot((t[:65]),v_df["abscoef"].values[:65],color="black",linestyle="-",linewidth="2",label="SCORE")
plt.plot((t[65:]),v_df["abscoef"].values[65:],color="black",alpha=0.35,linestyle="-",linewidth="2")
# SHADOWS
for j in [-1,1]:
for K in range(0,M,5):
plt.fill_between(range(Ng),(m+j*(K)*s)[::-1][:Ng],(m+j*(K+5)*s)[::-1][:Ng],
color=mycolor,linewidth=0,alpha=0.7*(M-K)/M)
# RANDOM SIGNATURE
plt.plot((m)[::-1][:Ng],color=mycolor,linestyle="-",linewidth=2,label="RANDOM SCORE")
# 5-SIGMA THRESHOLD
plt.axhline(K2/np.sqrt(v_df.shape[0]),linewidth=1,color=mycolor,linestyle="--",label="%d$\sigma$ threshold" %K2,)
# AXIS
plt.ylim([0,.14])
plt.xlim([0,200])
plt.xlabel("Rank-ordered genes",size=labelsize)
plt.ylabel("Absolute value of coefficient",size=labelsize)
plt.xticks([1,65,50,100,150,200],size=ticks_size)
plt.yticks(size=ticks_size)
plt.legend(fontsize=14)
plt.savefig("./output/replication/our/figures/Figure1d.pdf")
#plt.show()
# ### Save signature
# We save the signature in pickled and CSV form, as well as a list of all the genes in the final gene-expression matrix. These are known as the **background** genes, and must be used to correctly identify overrepresented gene sets.
# In[22]:
signature.to_csv("./output/replication/our/signature/signature.csv")
signature.to_pickle("./output/replication/our/signature/signature.p")
# In[23]:
np.savetxt("./output/replication/our/batches1-4_merged/background_genes.txt",v_df.index.get_level_values(0).values,fmt="%s")