"""The methods used to run the Pharmacogenomic ML pipeline.
It includes each of the methods necessary for preprocessing, normalization,
feature selection, domain adaptation and drug resistance prediction
"""
import pandas as pd
import numpy as np
from functools import reduce

from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import (SelectKBest, SelectPercentile, f_regression,
                                       mutual_info_regression, SelectFromModel,
                                       VarianceThreshold)
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import QuantileTransformer
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

from jive.AJIVE import AJIVE
from jive.PCA import PCA
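
# Overview (illustrative sketch; `ge`, `dr`, and the drug name are hypothetical
# inputs, not defined in this module): a typical run combines expression and
# resistance data for one drug, then selects features and fits a regressor.
#
#     >>> df = combine(ge, dr, 'Cisplatin')
#     >>> X, y = df.drop(columns=['AUC_IC50']), df['AUC_IC50']
#     >>> X_tr, X_te, y_tr, y_te = train_test_split(X, y)
#     >>> X_tr, X_te, mask = fs(f_regression, X_tr, X_te, y_tr, n=0.1)
#     >>> reg = drp(linear_model.ElasticNet(), X_tr, y_tr)
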

def combine(ge, dr, drug, metric='AUC_IC50'):
    """Combines drug resistance and gene expression data.

    Puts gene expression and drug resistance data together into one dataframe;
    drug resistance is incorporated using the chosen metric.

    Args:
        ge (`pandas.DataFrame`): gene expression data.
        dr (`pandas.DataFrame`): drug resistance data.
        drug (str): drug name.
        metric (str): the drug resistance measure that will be used.

    Returns:
        A `pandas.DataFrame` containing both gene expression and drug resistance
        measurements for the given drug.
    """
    return ge.join(dr[dr['Drug_name'] == drug][['CCL', metric]].set_index('CCL'), how='right')
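
# Illustrative sketch: `combine` expects `ge` indexed by cancer cell line and
# `dr` with 'Drug_name', 'CCL', and a metric column. The gene, CCL, and drug
# names below are hypothetical.
#
#     >>> ge = pd.DataFrame({'TP53': [2.1, 0.3]}, index=['CCL1', 'CCL2'])
#     >>> dr = pd.DataFrame({'Drug_name': ['Cisplatin', 'Cisplatin'],
#     ...                    'CCL': ['CCL1', 'CCL2'],
#     ...                    'AUC_IC50': [0.8, 0.2]})
#     >>> combine(ge, dr, 'Cisplatin')
#
# The result has one row per CCL, with the gene columns plus 'AUC_IC50'.
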

def pre(data, p=0.1, t=4):
    """Applies pre-processing to data.

    Performs pre-processing on the gene expression data and returns a dataframe
    containing only the selected genes. This pre-processing is used to drop
    unexpressed genes that a microarray could detect as background noise.

    Args:
        data (`pandas.DataFrame`): the data on which pre-processing will be
            performed; rows should be cancer cell lines (CCLs) and columns
            should be genes.
        p (float): if in the range ]0, 1[, the minimum fraction of CCLs in which
            a gene needs to be expressed; genes expressed in a smaller fraction
            are dropped. If p >= 1, it is instead the minimum number of CCLs.
        t (float): the threshold at or below which genes are considered unexpressed.

    Returns:
        A `pandas.DataFrame` with only the selected genes included.
    """
    data = data.fillna(0)
    # Mark the elements above the threshold, i.e. the expressed measurements
    expressed = (data.to_numpy() > t).T.astype(np.int8)
    # If p is smaller than one it is the fraction of CCLs in which a gene needs
    # to be expressed to be selected; otherwise it is an absolute count
    if p < 1:
        n = [np.count_nonzero(i) > p * data.shape[0] for i in expressed]
    else:
        n = [np.count_nonzero(i) > p for i in expressed]
    names = {data.keys()[k]: v for k, v in enumerate(n)}
    index = [k for k, v in names.items() if v]
    return data[index]
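
# Illustrative sketch: with p=0.5 and t=4, a gene must exceed 4 in more than
# half of the CCLs to be kept. The gene and CCL names are hypothetical.
#
#     >>> data = pd.DataFrame({'GENE_A': [5.0, 6.0, 0.1],
#     ...                      'GENE_B': [0.2, 0.1, 0.0]},
#     ...                     index=['CCL1', 'CCL2', 'CCL3'])
#     >>> pre(data, p=0.5, t=4).columns.tolist()
#     ['GENE_A']
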

def norm(model, ge):
    """Applies normalization to data.

    Normalizes the given data using the given model, then returns the
    normalized data.

    Args:
        model (`sklearn.base.TransformerMixin` class): a normalization method on
            which fit_transform can be called; the class itself, not an instance.
        ge (`pandas.DataFrame` or `numpy.ndarray`): the data that should be normalized.

    Returns:
        A numpy array containing the transformed data.
    """
    if model == QuantileTransformer:
        model = model(output_distribution='normal')
    else:
        model = model()
    return model.fit_transform(ge)
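
# Illustrative sketch: `norm` takes the transformer class itself, and maps
# QuantileTransformer to a normal output distribution as handled above.
# `ge_filtered` is a hypothetical pre-processed expression frame.
#
#     >>> ge_normed = norm(QuantileTransformer, ge_filtered)
#
# The result is a plain numpy array, so row/column labels must be re-attached
# by the caller if they are needed downstream.
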

def group_ajive(data, joint):
    """Performs Angle-based Joint and Individual Variation Explained (AJIVE),
    a type of domain adaptation.

    Performs domain adaptation by learning the joint-space projections from the
    train set and then projecting the test set onto the joint space. For this,
    the methods `ajive` and `ajive_predict` are used respectively.

    Args:
        data (dict of `pandas.DataFrame`): the data on which domain adaptation
            is performed, with 'train' and 'test' entries whose index level 0
            is the domain.
        joint (int): the rank of the joint space of the domains.

    Returns:
        A dict with the domain-adapted 'train' and 'test' data.
    """
    result = {}
    train, model = ajive(data['train'], joint)
    result['train'] = train
    result['test'] = ajive_predict(model, data['test'])
    return result
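
# Illustrative sketch: the frames below are hypothetical, each with a
# MultiIndex whose level 0 is the domain and level 1 is the CCL.
#
#     >>> projected = group_ajive({'train': train_df, 'test': test_df}, joint=5)
#     >>> projected['train']   # samples projected onto the rank-5 joint space
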

def ajive(data, joint):
    """Fits an AJIVE model on the train data and returns its joint-space
    projection together with the fitted model."""
    # Only overlapping cancer cell lines will be used
    blocks = {j: data.loc[j] for j in data.index.levels[0]}
    ccls = {k: v.index for k, v in blocks.items()}
    overlap = reduce(np.intersect1d, ccls.values())
    blocks = {k: v.loc[overlap] for k, v in blocks.items()}
    # Calculate the number of singular values that should be used for the PCA
    # projections of each block
    init = {k: svals(v, joint) for k, v in blocks.items()}
    model = AJIVE(init, joint_rank=joint)
    model.fit(blocks)
    result = ajive_predict(model, data)
    return result, model

def ajive_predict(model, data):
    """Projects each domain block of `data` onto the fitted AJIVE joint space."""
    mod = {}
    for i in model.blocks.keys():
        for j, ele in enumerate(model.blocks[i].joint.predict_scores(data.loc[i])):
            mod[(i, data.loc[i].index[j])] = list(ele)
    result = pd.DataFrame(mod.values(), index=mod.keys())
    return result

def svals(data, joint):
    """Heuristically picks an initial signal rank for a block's PCA projection.

    Counts the leading singular value and every subsequent one whose drop from
    its predecessor is smaller than 20% of the previous drop, and returns at
    least `joint`.
    """
    init = 0
    last = 0
    previous_step = 0
    for i in PCA().fit(data).svals_:
        if last - i < 0.2 * previous_step:
            init += 1
        previous_step = last - i
        last = i
    return max(init, joint)

def fs(model, X_train, X_test, y, n=0, tuning=None):
    """Selects features using the given model.

    Selects the best features based on the given model.

    Args:
        model (estimator class or score function): the method that will be used
            to determine the best features.
        X_train (`pandas.DataFrame` or `numpy.ndarray`): train data on which the
            best features will be selected.
        X_test (`pandas.DataFrame` or `numpy.ndarray`): test data; this will only
            be transformed by keeping the selected features.
        y (`pandas.DataFrame` or `numpy.ndarray`): labels of the train set.
        n (float): the fraction of the features that will be selected (for
            `VarianceThreshold`, it is the variance threshold itself).
        tuning (`classes.tuning`): defines a hyperparameter search space over
            which the feature selection method can be optimized.

    Returns:
        The train and test sets with only the selected features, and a boolean
        mask marking which features were selected.
    """
    # Model-based feature selection
    if model in [DecisionTreeRegressor, RandomForestRegressor, linear_model.ElasticNet, linear_model.Lasso]:
        model = model()
        model.fit(X_train, y)
        if tuning is not None:
            r = RandomizedSearchCV(model, tuning.space, n_iter=tuning.iterations,
                                   n_jobs=tuning.jobs, cv=tuning.cv, scoring=tuning.scoring)
            r.fit(X_train, y)
            model = r.best_estimator_
        n = n * X_train.shape[1]
        selector = SelectFromModel(estimator=model, max_features=int(n), threshold=-np.inf)
        X_train = selector.fit_transform(X_train, y)
    # Variance threshold selection
    elif model == VarianceThreshold:
        selector = VarianceThreshold(n)
        X_train = selector.fit_transform(X_train)
    # SelectKBest-style selection (e.g. f_regression, mutual_info_regression)
    else:
        n = int(n * X_train.shape[1])
        selector = SelectKBest(model, k=n)
        X_train = selector.fit_transform(X_train, y)
    var = selector.get_support()
    index = [i for i, x in enumerate(var) if x]
    X_test = np.apply_along_axis(lambda x: x[index], 1, X_test)
    print('After fs ' + str(X_test.shape) + ' ' + str(X_train.shape))
    return X_train, X_test, var
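
# Illustrative sketch: keep the top 10% of genes ranked by univariate
# F-statistic (f_regression is one of the score functions imported above);
# the data splits are hypothetical.
#
#     >>> X_tr, X_te, mask = fs(f_regression, X_train, X_test, y_train, n=0.1)
#     >>> X_tr.shape[1] == int(0.1 * X_train.shape[1])
#     True
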

def jump(domain, n, data):
    """Augments one sample for FEDA.

    Expands every feature value into n+1 slots: a shared copy, followed by a
    copy in the block belonging to `domain` and zeros in the other domains'
    blocks.
    """
    result = []
    for i, ele in enumerate(data):
        # Shared copy, seen by every domain
        result.append(ele)
        # Domain-specific copy: zeros for the preceding domains, the value for
        # this domain, zeros for the remaining domains
        for j in range(0, domain):
            result.append(0)
        result.append(ele)
        for j in range(0, n - domain - 1):
            result.append(0)
    return result
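
# Illustrative sketch: with 2 domains, the sample [5, 7] from domain 0 becomes
#
#     >>> jump(0, 2, [5, 7])
#     [5, 5, 0, 7, 7, 0]
#
# i.e. each feature is laid out as (shared, domain 0, domain 1).
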

def feda(domains):
    """Performs Frustratingly Easy Domain Adaptation (FEDA), a type of domain
    adaptation.

    Performs domain adaptation on the drug data using Frustratingly Easy Domain
    Adaptation: every sample keeps a shared copy of its features plus a copy in
    its own domain's block, with zeros elsewhere.

    Args:
        domains (sequence of 2-D arrays): the per-domain data on which FEDA will
            be performed; each element holds the samples of one domain, one row
            per sample.

    Returns:
        A `numpy.ndarray` with the transformed data.
    """
    n = len(domains)
    features = domains[0].shape[1] * (n + 1)
    new = np.zeros(features)
    for i, data in enumerate(domains):
        for j in data:
            new = np.vstack([new, jump(i, n, j)])
    # Drop the all-zeros row used to seed the vstack
    return new[1:]
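
# Illustrative sketch with two domains of two features each: the output has
# 2 * (2 + 1) = 6 columns per sample.
#
#     >>> a = np.array([[1.0, 2.0]])   # domain 0
#     >>> b = np.array([[3.0, 4.0]])   # domain 1
#     >>> feda([a, b])
#     array([[1., 1., 0., 2., 2., 0.],
#            [3., 0., 3., 4., 0., 4.]])
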

def drp(model, X, y, tuning=None):
    """Trains a model on drug data.

    Trains the model given by `model` on the given data, optionally tuning its
    hyperparameters first, and returns the fitted model.

    Args:
        model (estimator object): model that will be trained on the data.
        X (`pandas.DataFrame` or `numpy.ndarray`): features of the samples on
            which the model is trained.
        y (`pandas.DataFrame` or `numpy.ndarray`): labels of the samples on
            which the model is trained.
        tuning (`classes.tuning`): hyperparameter search space over which the
            model will be optimized. If not specified, default values will be used.

    Returns:
        The fitted model.
    """
    if tuning is not None:
        r = RandomizedSearchCV(model, tuning.space, n_iter=tuning.iterations,
                               n_jobs=tuning.jobs, cv=tuning.cv, scoring=tuning.scoring)
        r.fit(X, y)
        fitted = r.best_estimator_
    else:
        fitted = model.fit(X, y)
    return fitted
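
# Illustrative sketch: fit an ElasticNet on prepared drug data. `X_train` and
# `y_train` are hypothetical; without a `tuning` object (this repo's own
# `classes.tuning`), sklearn defaults are used.
#
#     >>> reg = drp(linear_model.ElasticNet(), X_train, y_train)
#     >>> predictions = reg.predict(X_test)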