-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path5_sig_pcalc_parts.R
executable file
·241 lines (204 loc) · 9.21 KB
/
5_sig_pcalc_parts.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
# This program is "new" because it allows you to work with full arms as well as sections
# and is out because allows you to handle outliers
#################################################
# USAGE
#
# Given the jagged array files created by 4_hom_stats_parts.py,
# this code will compute a p-value for each chromosome arm or section.
# We use permutation tests to arrive at the p-value, and the permutation aspects are located
# in functions_sig.R.
#
# Note: The result is the collection of uncorrected p-values, FDR will be used in scripts to follow
#
# limited functionality for outliers:
# It is possible to add a column within the phenotype file to flag patients that are considered outliers
# for such a case, use the word "outlier" as columname together with some useful identifier for you like
# outlier17qs3 that could be read outlier in section 3 from Chromosome 17q. The script hasn't been fully
# finished to handle outliers smoothly, you will need to create a dictionary part for which the outliers
# will be removed for all sections. Also for now you can only have one column with the word outlier in it
# This functionality still needs work.
#
# Input: The following tab delimited text files.
# 1. The dictionary files from 2_cgh_dictionary_cytoband.R.
# 2. The phenotype file where the patient names matches the order from the aCGH file with the log2 ratios.
# A column with the phenotype (ERBB2, basal, test, sim, etc) coded with 0's and 1's. The column name will be the
# argument "phenotype"
# 3. The jagged array (from 4_hom_stats_parts.py) containing the value of the parameter (B0, B1) for all the patient
# simplicial complexes/networks over the epsilon filtrations. The files should be under
# ~/Results/SET/2D/Homology/B0_2D_SET_chrArm_seg.txt for a specific aCGH file named SET and a specific chromosome-arm
# combination "chrArm"
#
# Output:
# A tab delimited text file with 12 columns (# "Chr", "Arm", "Dimension", "Segment",
# "Idx0.Beg", "Idx0.End", "bpStart", "bpEnd", "CytoStart", "CytoEnd","P.Value", "Test.Stat") which
# gives the p-value and test statistic for each chromosome arm for each computed dimension.
# "Idx0.Beg" and "Idx0.End" are the beginning and ending positions in the CGH file for the sections (0 start)
# Output will be saved under ~/Research/Results/SET/significance/pvals
#################################################
# COMMAND LINE ARGUMENTS
# 4. param (B0, B1, D, CC)
# 5. phenotype (ERBB2, basal, test, sim, etc)
# 6. dataSet (SET, etc)
# 7. partNum (an integer with the part from the dictionary)
# 8. action: arms, sect (for sections)
# 9. outliers to remove: yes, no
# 10. subdir a directory within /dataSet dir to read the dictionaries
# EXAMPLE
# R --vanilla --args B0 Her2 NCMAD 10 sect no valHorl< 5_sig_pcalc_parts.R
#
# Get the command line arguments
args = commandArgs();
param <- args[4];
phenotype <- args[5];
dataSet <- args[6];
partNum <- args[7];
action <- args[8];
outliers <- args[9];
subdir <- args[10];
# for debugging purposes only
#param <- "B0";
#phenotype <- "Hist_grade3";
#dataSet <- "horlings";
#partNum <- "8";
#action<- "sect";
#outliers <- "no";
#subdir <- "sect";
###############################
# READ FILES
# Establish the beginning path
#begPath <- "~/Research";
begPath <- "..";
# TODO: The following section needs cleaning as D and CC are not in use anymore
# Determine "Homology" or "Networks" based on param
if (identical(param,"D") || identical(param,"CC")) {
paramType <- "Networks";
shortParamType <- "net";
} else {
paramType <- "Homology";
shortParamType <- "hom"
}
# The source code for the permutations
#srcPath <- paste(begPath, "Code", "functions_sig.R", sep="/");
srcPath <- paste(begPath, "TAaCGH", "functions_sig.R", sep="/");
source(srcPath);
# Read the chromosome dictionary data
chrDictFile <- paste(dataSet,"_",subdir, "_dict_", partNum, ".txt", sep="");
#chrDictPath <- paste(begPath, "Data", dataSet, chrDictFile, sep="/");
chrDictPath <- paste(begPath, "Data", dataSet, subdir, chrDictFile, sep="/");
chrDict <- read.table(chrDictPath, header=TRUE, sep="\t");
print(paste("dictPath",chrDictPath, sep=" "));
print(paste("partNum",partNum, sep=" "));
# Read the phenotype data
phenFile <- paste(dataSet, "phen.txt", sep="_");
phenPath <- paste(begPath, "Data", dataSet, phenFile, sep="/");
phenData <- read.table(phenPath, header=TRUE, sep="\t");
print(paste("phenPath",phenPath, sep=" "));
###############################
# BEGIN PROGRAM
# getting the indices for each phenotype group
phen1indices <- which(phenData[,phenotype] == 1);
phen2indices <- which(phenData[,phenotype] == 0);
phen1num<-length(phen1indices);
phen2num<-length(phen2indices);
print(paste("phenotype", phenotype, sep=" "));
print("phenData$phenotype", phenData[,phenotype]);
print(paste("There are ", phen1num, " phenotype1", sep=""));
print(paste("There are ", phen2num, " phenotype2", sep=""));
# Write path for p-values and header
uncorrFile <- paste(param, "_", phenotype, "_", dataSet, "_", subdir,"_pvals", "_",partNum, ".txt", sep="");
#uncorrFolder <- paste(begPath, "Results", dataSet, "significance", "pvals", sep="/");
uncorrFolder <- paste(begPath, "Results", dataSet, subdir, "significance", "pvals", param, phenotype, sep="/");
print(paste("dictResults",uncorrFolder, sep=" "));
if(!file.exists(uncorrFolder)) {
dir.create(uncorrFolder, recursive=TRUE);
}
uncorrPath <- paste(uncorrFolder, uncorrFile, sep="/");
# Create a container for the p-value list
pvals <- c();
for(i in c(1:nrow(chrDict)))
{
chr <- chrDict[i,1];
arm <- as.vector(chrDict[i,2]);
chrArm <- paste(chr, arm, sep="");
seg <- chrDict[i,6];
beg <- chrDict[i,3];
end <- chrDict[i,4];
if (action=="sect") {
bpStart <- chrDict[i,7];
bpEnd <- chrDict[i,8];
CytoStart <- as.vector(chrDict[i,9]);
CytoEnd <- as.vector(chrDict[i,10]);
}
print(paste("On chromosome ", chrArm, " segment ", seg, sep=""));
dim<-"2";
print(paste("On dimension ", dim, sep=""));
# Read in data regardless of phenotype
#dataFile <- paste(param, "_", dim, "D_", dataSet, "_", subdir, "_", chrArm, "_seg", seg,".txt", sep="");
dataFile <- paste(param, "_", dim, "D_", dataSet, "_", action, "_", chrArm, "_seg", seg,".txt", sep="");
#dataPath <- paste(begPath, "/Results/", dataSet, "/", subdir, "/", dim, "D/", paramType, "/", dataFile, sep="");
dataPath <- paste(begPath, "/Results/", dataSet, "/", action, "/", dim, "D/", paramType, "/", dataFile, sep="");
print(dataPath);
# Need to determine the maximum number of columns in the file ahead of time
# because, again, R is strange about reading in jagged arrays.
nColData <- max(count.fields(dataPath, sep="\t"));
# Read in parameter data
data <- read.table(dataPath, header=FALSE, sep="\t", fill=TRUE, col.names=1:nColData);
# Fill in the NAs in the jagged array with a parameter appropriate value.
# For B0 and CC that value is 1 because that is the minimum/maximum, respectively.
# For B1 that value is 0.
# For D that value is the end index - start index in chrDict (0-based from cgh_dictionary.py).
if (identical(param,"D")) {
fillValue <- chrDict[i,4] - chrDict[i,3]
data[is.na(data)] <- fillValue;
} else if (identical(param,"CC") || identical(param,"B0")) {
fillValue <- 1.0;
data[is.na(data)] <- fillValue;
} else if (identical(param,"B1")) {
fillValue <- 0.0;
data[is.na(data)] <- fillValue;
}
# Initializing variables
phen1indices_noOut<-phen1indices;
phen2indices_noOut<-phen2indices;
phen1num_noOut<-phen1num;
phen2num_noOut<-phen2num;
outlier<-paste("out_",chrArm, sep="");
# Remove outliers from arm
if (outliers=="yes") {
if ( (outlier %in% colnames(phenData))==TRUE) {
outlierIndx<-which(phenData[,outlier]==1);
phen1indices_noOut<-setdiff(phen1indices, outlierIndx);
phen1num_noOut<-length(phen1indices_noOut);
phen2indices_noOut<-setdiff(phen2indices, outlierIndx);
phen2num_noOut<-length(phen2indices_noOut);
print(paste("After removing outliers in ",outlier,":", sep=""));
print(paste("There are ", phen1num_noOut, " patients for test", sep=""));
print(paste("There are ", phen2num_noOut, " patients for control", sep=""));
}
}
# Separate by phenotype
exp <- data[phen1indices_noOut,];
con <- data[phen2indices_noOut,];
# Compute the pvalue
rndData <- randomize(exp, con, fillValue);
pval <- rndData[[1]];
stat <- rndData[[2]];
sumPos <- rndData[[3]];
sumNeg <- rndData[[4]];
print(pval);
print(stat);
print(partNum);
# Put the pvals in the container
if (action=="sect") {
pvals <- rbind(pvals, c(chr, arm, dim, seg, phen1num_noOut, phen2num_noOut, beg, end, bpStart, bpEnd, CytoStart, CytoEnd, pval, stat, sumPos, sumNeg));
} else {
pvals <- rbind(pvals, c(chr, arm, dim, seg, phen1num_noOut, phen2num_noOut, beg, end, pval, stat, sumPos, sumNeg));
}
}
# Rename columns and write the file
if (action=="sect") {
colnames(pvals) <- c("Chr", "Arm", "Dimension", "Segment", "TestNum", "CtrlNum", "Idx0.Beg", "Idx0.End", "bpStart", "bpEnd", "CytoStart", "CytoEnd","P.Value", "Test.Stat", "AreaPos", "AreaNeg");
} else {
colnames(pvals) <- c("Chr", "Arm", "Dimension", "Segment", "TestNum", "CtrlNum", "Idx0.Beg", "Idx0.End","P.Value", "Test.Stat", "AreaPos", "AreaNeg");
}
write.table(pvals, file=uncorrPath, row.names=F, sep='\t')