-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfunctions_data_processing.R
executable file
·320 lines (257 loc) · 12 KB
/
functions_data_processing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
# Each function which changes the data in some way will leave a receipt of the clones removed at each step
# This function filters a dataset based on the target information (could easily make it for the clones)
# We accept only targets/clones that are in the needles array. In needles array will come from a file
# which is a dictionary between clones, targets, chromosomes, and cytobands. For example, the UCSF HumArray 2.0
# file which can be downloaded from: http://0-www.ncbi.nlm.nih.gov.ilsprod.lib.neu.edu/projects/geo/query/acc.cgi?acc=GPL4420
filter.on.target <- function(cyto, needles, haystack, CGHstart) {
print("Filtering on target...");
nRowsNeedles <- nrow(needles);
filteredHaystack <- c();
for(i in c(1:nRowsNeedles)) {
# Establish the needle, the indices in the haystack for that needle, and the data
# in the haystack that goes with that needle.
needleTarget <- toString(needles$Target[i]);
haystackIndices <- which(as.matrix(haystack$Target) == needleTarget);
haystackData <- haystack[haystackIndices, ];
# We expect some Targets in haystack to not be in needles, but it is also the case that Targets
# in needles could not be in haystack. This if accounts for that.
if(length(haystackIndices) != 0) {
# Now we're going to reconstitute the haystack with the same CGH information (obviously)
# but with clone, chromosome, and base pair information from the UCSF HumArray 2.0 file
# (needles) and arm/cytoband information from the UCSC Genome Browser (cyto).
# For other datasets, we will still use the cyto file (possibly updated versions) and
# we will use clone, target, chromosome, base pair information which describes whatever
# array was used for the experiment.
# This will standardize the data structure and order.
# Grab what you can from the UCSF HumArray file.
needleClone <- toString(needles$Clone[i]);
needleChrom <- toString(needles$Chrom[i]);
needleBP <- as.integer(needles$bp[i]);
# Set up the grab from the cytoband file.
# First, select the relevant chromosome information from the cytoband file.
cytoIndices <- which(as.matrix(cyto$Chrom) == needleChrom);
cytoData <- cyto[cytoIndices, ];
nRowsCyto <- nrow(cytoData);
# Determine extremes of the basepairs of the chromosome
minBP <- as.integer(cytoData$Begin[1]);
maxBP <- as.integer(cytoData$End[nRowsCyto]);
# Make sure we'll find what we need. If the needleBP exceeds the maxBP from the
# cytoBand file assign the arm and cytoband to the last row.
if(needleBP > maxBP) {
# stop(paste("The base pair of the needle (", needleClone,")", needleBP,"is outside [", minBP, ",", maxBP,"]", sep=" ", collapse=NULL));
needleArm <- "q";
needleCytoband <- toString(cytoData$Band[nRowsCyto]);
} else {
# Find the correct arm and cytoband
for(j in c(1:nRowsCyto)) {
# Check the row to see if needleBP is here and extract relevant information.
if(cytoData$Begin[j] < needleBP && needleBP < cytoData$End[j]) {
needleArm <- toString(cytoData$Arm[j]);
needleCytoband <- toString(cytoData$Band[j]);
break;
}
}
}
# Stitch together the haystack data with the front matter.
for(k in c(1:nrow(haystackData))) {
row <- cbind(needleClone, needleTarget, needleChrom, needleArm, needleBP, needleCytoband, haystackData[k, CGHstart:ncol(haystackData)]);
filteredHaystack <- rbind(filteredHaystack, row);
}
}
# Print a progress report
if(i %% 100 == 0) {
print(paste("On row", i, "of", nRowsNeedles, sep=" ", collapse=NULL));
}
}
# Name the front matter in a standard way. The remaining colnames are samples.
colnames(filteredHaystack) <- c("Clone", "Target", "Chrom", "Arm", "bp", "Cytoband", colnames(haystack)[CGHstart:length(colnames(haystack))]);
return(filteredHaystack);
}
# Assign the appropriate arm and cytoband based on the $bp column of needles, which here is the dataset
assign.arms <- function(cyto, needles, CGHstart) {
print("Assigning arm information...");
nRowsNeedles <- nrow(needles);
armedNeedles <- c();
for(i in c(1:nRowsNeedles)) {
needleClone <- toString(needles$Clone[i]);
needleChrom <- toString(needles$Chrom[i]);
needleBP <- as.integer(needles$bp[i]);
# Set up the grab from the cytoband file.
# First, select the relevant chromosome information from the cytoband file.
cytoIndices <- which(as.matrix(cyto$Chrom) == needleChrom);
cytoData <- cyto[cytoIndices, ];
nRowsCyto <- nrow(cytoData);
# Determine extremes of the basepairs of the chromosome
minBP <- as.integer(cytoData$Begin[1]);
maxBP <- as.integer(cytoData$End[nRowsCyto]);
# Make sure we'll find what we need. If the needleBP exceeds the maxBP from the
# cytoBand file assign the arm and cytoband to the last row.
if(needleBP > maxBP) {
# stop(paste("The base pair of the needle (", needleClone,")", needleBP,"is outside [", minBP, ",", maxBP,"]", sep=" ", collapse=NULL));
needleArm <- "q";
needleCytoband <- toString(cytoData$Band[nRowsCyto]);
} else {
# Find the correct arm and cytoband
for(j in c(1:nRowsCyto)) {
# Check the row to see if needleBP is here and extract relevant information.
if(cytoData$Begin[j] <= needleBP && needleBP < cytoData$End[j]) {
needleArm <- toString(cytoData$Arm[j]);
needleCytoband <- toString(cytoData$Band[j]);
break;
}
}
}
# Build the new data
row <- cbind(needleClone, needleChrom, needleArm, needleBP, needleCytoband, needles[i, CGHstart:ncol(needles)]);
armedNeedles <- rbind(armedNeedles, row);
# Print a progress report
if(i %% 100 == 0) {
print(paste("On row", i, "of", nRowsNeedles, sep=" ", collapse=NULL));
}
}
###################
# This 2 really needs to be a 1 for the chin dataset because we get the bp information unlike with Climent...
CGHstart <- CGHstart + 1;
colnames(armedNeedles) <- c("Clone", "Chrom", "Arm", "bp", "Cytoband", colnames(armedNeedles)[CGHstart:length(colnames(armedNeedles))]);
return(armedNeedles);
}
# Need to search for duplicate clones and average them to one clone
# Inputs are data (column names to be elaborated upon later), CGHstart (the column index at which CGH data starts)
# Will verify that clone names are the same and also check kb locations (throwing an error if they're not)
average.duplicates <- function(data, CGHstart) {
print("Averaging duplicate probes...");
nRows <- nrow(data);
# Container for the result
newData <- c();
# Container for probes with duplicates and their number
reportDropped <- c();
i = 1;
while(i < nRows) {
# Define the needle, the haystack, and the indices where found
cloneNeedle <- toString(data$Clone[i]);
cloneHaystack <- as.matrix(data$Clone);
foundIndices <- which(cloneHaystack == cloneNeedle);
# Extract the duplicate clones
duplicateClones <- data[foundIndices,];
# If duplicateClones has more than one row, average over the columns while ignoring NAs.
# Have to replace NaNs from entire column being NA with NA for later imputation.
if(nrow(duplicateClones) != 1) {
# Check the base pair locations are the same by seeing if standard deviation is 0
genLocs <- as.numeric(as.matrix(duplicateClones$bp));
if(sd(genLocs) != 0) {
stop(paste("Error: Found clone match for", cloneNeedle,"but kb locations do not match.", sep=" ", collapse=NULL))
}
# Save the list of duplicate clones in their original form
reportDropped <- rbind(reportDropped, duplicateClones);
# Column means and replacement of NaN with NA
averagedCGH <- colMeans(duplicateClones[,CGHstart:ncol(data)], na.rm=T);
averagedCGH[is.nan(averagedCGH)] <- NA;
# Put together the averaged data with the front matter information for the clone.
# Grab the front matter of the first item in duplicateClones since it's the same for all.
newData <- rbind(newData, c(duplicateClones[1,1:CGHstart-1], averagedCGH));
} else {
# If there are no duplicates just add the unadulterated entry to newData.
newData <- rbind(newData, duplicateClones);
}
# Advance to the next row outside of the (possibly duplicate) block
i = i + nrow(duplicateClones);
# Print a progress report
if(i %% 100 == 0) {
print(paste("On row", i, "of", nRows, sep=" ", collapse=NULL));
}
}
# Return both the new data containing the averages of duplicate clones and the
# original clone data which were averaged.
return(list(newData, reportDropped));
}
# Reduce according to proportion of missing NAs
row.reduce.NAs <- function(data, CGHstart, maxMissingProp) {
print("Eliminating rows with too many NAs...");
nRows <- nrow(data);
reducedData <- c();
reportDropped <- c();
for(i in c(1:nRows)) {
propNA <- length(which(is.na(data[i,CGHstart:ncol(data)]))) / (ncol(data) - CGHstart);
if(propNA < maxMissingProp) {
reducedData <- rbind(reducedData, data[i,]);
} else {
reportDropped <- rbind(reportDropped, data[i,]);
}
# Print a progress report
if(i %% 100 == 0) {
print(paste("On row", i, "of", nRows, sep=" ", collapse=NULL));
}
}
return(list(reducedData, reportDropped));
}
# Verify the base pair orderings are correct are ascending within each chromosome
verify.kb.order <- function(data) {
}
# Verify that the clone ordering is the same
verify.clone.order <- function(data1, data2) {
print("Verifying clone parity...");
# Check that the lengths are the same
if(nrow(data1) != nrow(data2)) {
print("More clones in one dataset than the other. Apply match.clones.");
return(list(FALSE, "Length"));
}
# At this point the lengths are the same so we should make
# sure the order is actually the same.
for(i in c(1:nrow(data1))) {
if(toString(data1$Clone[i]) != toString(data2$Clone[i])) {
# Pretty sure this never happens by virtue of the way match.clones works
print("Clones are not in the same order. Apply order.clones.");
return(list(FALSE, "Order"));
}
}
# If we haven't left by now...
print("The clones in the datasets are identical.");
return(list(TRUE, "Great"));
}
# Match clones in two datasets. This will ensure that the set of clones in each dataset
# is the same, but will not necessarily guarantee the same ordering. That's for order.clones.
intersect.clones <- function(data1, data2) {
print("Intersect clones...");
data1Clones <- as.matrix(data1$Clone);
data2Clones <- as.matrix(data2$Clone);
commonClones <- intersect(data1Clones, data2Clones);
# The order here is CRUCIAL. When wrapping %in% in a which statement, the indices
# returned are with respect to the X in X %in% Y. This is counter-intuitive, but a
# simple example clears it up.
data1Indices <- which(data1Clones %in% commonClones);
data2Indices <- which(data2Clones %in% commonClones);
# Extract the commonClones from each dataset
matchedData1 <- data1[data1Indices, ];
matchedData2 <- data2[data2Indices, ];
# Report which clones have been removed from each dataset.
droppedClones1 <- setdiff(data1Clones, commonClones);
droppedClones2 <- setdiff(data2Clones, commonClones);
droppedIndices1 <- which(data1Clones %in% droppedClones1);
droppedIndices2 <- which(data2Clones %in% droppedClones2);
# Extract the dropped clones from each dataset
droppedData1 <- data1[droppedIndices1, ];
droppedData2 <- data2[droppedIndices2, ];
return(list(matchedData1, matchedData2, droppedData1, droppedData2));
}
# Order out-of-order data (oooData) based on a masterClones column vector. This will be called
# after matching is performed, so they will necessarily have the same length and the
# same membership!
order.clones <- function(masterClones, oooData) {
print("Ordering clones to match the master list...");
nRows <- nrow(masterClones);
# The ordered data
oData <- c();
for(i in c(1:nrow(masterClones))) {
# What are we trying to find?
cloneNeedle <- toString(masterClones[i,1]);
# Get the index of the clone in oooData.
# We can be sure of a unique answer because all duplicates are gone.
cloneIndexOoo <- which(oooData$Clone == cloneNeedle);
oData <- rbind(oData, oooData[cloneIndexOoo, ]);
# Print a progress report
if(i %% 100 == 0) {
print(paste("On row", i, "of", nRows, sep=" ", collapse=NULL));
}
}
return(oData);
}