-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy path06_undirected_directed_analysis.R
399 lines (281 loc) · 16.1 KB
/
06_undirected_directed_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
# Elizabeth Parker May 2022
# Untargeted metabolomics workflow stage 6 (Undirected and directed analysis of untargeted metabolomics data)
# Interoperable code to perform basic analysis of untargeted MALDI or DI-MS or LCMS (alternative to Metaboanalyst or SIMCA)
# N.B. You will need your peak table in a specific format; if you have tidied your data using 05_tidy_data_from_MassUp, XCMS or macro,
# then you should already have the files you need.
# Otherwise you will need a pre-processed peak intensity table saved as a .csv with features (m/z or m/z__RT or m/z bins) as columns
# and samples as rows in a column called "Sample"
# you will need either your treatments (classes/ grouping information) in columns following the first column ("Sample")
# OR you will need a metadata file with "Sample" as the first column followed by your treatments (classes/ grouping information) in separate columns
# Load required packages ----
# library() errors immediately if a package is missing; the original used
# require(), which only returns FALSE and lets the script fail later on.
packages_to_load <- c("tidyr", "tibble", "dplyr", "readr", "stringr",
                      "ggplot2", "pcaMethods", "muma", "forcats", "vegan")
for (pkg in packages_to_load) {
  library(pkg, character.only = TRUE)
}
# Don't forget to cite these packages in your thesis/ manuscript.
# Build the citation table in one shot instead of growing a tibble row-by-row
# in a loop; collapsing the text also copes with packages whose citation()
# has more than one entry (which broke the original cell assignment).
cite_packages <- tibble(
  Package = packages_to_load,
  Citation = vapply(
    packages_to_load,
    function(pkg) paste(unlist(citation(pkg)$textVersion), collapse = " "),
    character(1)
  )
)
cite_packages
# If you tidied your peak table using "05_tidy_data_from..." you can use this function to get the peak table
# Import the tidied peak table produced by "05_tidy_data_from_...".
#
# MStype: string used to pick the right file (e.g. "XCMS", "MALDI").
# Returns a tibble read from the first "Data_for_metaboanalyst_2factor" file
# in Tidy_data/ whose name also matches MStype.
get_peak_table <- function(MStype) {
  files <- tibble(list_of_files = list.files("Tidy_data")) %>%
    filter(str_detect(list_of_files, "Data_for_metaboanalyst_2factor")) %>%
    filter(str_detect(list_of_files, MStype))
  # Fail early with a clear error instead of trying to read "Tidy_data/NA"
  if (nrow(files) == 0) {
    stop("No 'Data_for_metaboanalyst_2factor' file matching '", MStype,
         "' was found in Tidy_data/", call. = FALSE)
  }
  file_path <- paste0("Tidy_data/", files$list_of_files[1])
  file_to_import <- read_csv(file_path)
  # message() actually prints; the original paste() result was discarded
  message("The function found ", file_path, " and imported it for you")
  file_to_import
}
# Import the peak table for the chosen MS workflow (here an XCMS export)
peak_table <- get_peak_table(MStype = "XCMS")
## Useful manual step here: check how many features are included in your
## analysis (i.e. how many variables).
## This should be the number of variables in peak_table minus 1 (the 1 being
## the column of samples).
## You can report this as e.g. "After grouping and alignment, the processed
## data comprised XXX features for analysis."
# If you saved the tidied peak table yourself from another source, check it is
# formatted correctly, and then define it here instead.
# This is also an option if you have e.g. multiple tidied XCMS downloads in the same folder.
### peak_table <- read_csv("Tidy_data/Specify_your_file_name_here.csv")
# Import the metadata file (sample names + class/grouping columns) produced
# by "05_tidy_data_from_...".
#
# MStype: string used to pick the right file (e.g. "XCMS", "MALDI").
# Returns a tibble read from the first file in Tidy_data/ whose name
# contains "metadata" and matches MStype.
get_metadata <- function(MStype) {
  files <- tibble(list_of_files = list.files("Tidy_data")) %>%
    filter(str_detect(list_of_files, "metadata")) %>%
    filter(str_detect(list_of_files, MStype))
  # Fail early with a clear error instead of trying to read "Tidy_data/NA"
  if (nrow(files) == 0) {
    stop("No metadata file matching '", MStype,
         "' was found in Tidy_data/", call. = FALSE)
  }
  file_path <- paste0("Tidy_data/", files$list_of_files[1])
  file_to_import <- read_csv(file_path)
  # message() actually prints; the original paste() result was discarded
  # (and was placed before the import, unlike get_peak_table)
  message("The function found ", file_path, " and imported it for you")
  file_to_import
}
# Import the matching metadata (classes/grouping information) for the samples
metadata <- get_metadata(MStype = "XCMS")
# The next function runs an initial PCA on your data so you can see whether
# there are any major outliers, and whether there is a strong enough pattern
# to proceed with directed analysis.
# Run an exploratory PCA (pareto scaling, NIPALS, 5 PCs) on peak_table and
# join the scores with the metadata so later plots can colour/shape by class.
#
# MStype is accepted for interface compatibility but is not used: the
# function reads the globals `peak_table` and `metadata`.
# Returns a list:
#   [[1]] data frame of R2 / cumulative R2 per PC (plus a percentage column
#         used for axis labels)
#   [[2]] PCA scores merged with the metadata, keyed by "Row.names"
initial_PCA <- function(MStype) {
  temp <- peak_table %>%
    column_to_rownames(var = "Sample")
  # PCA model on the scaled data. Note the argument is `center`: the original
  # `centre=` was silently swallowed by `...` and ignored -- harmless only
  # because center = TRUE is prep()'s default.
  raw <- prep(temp, scale = "pareto", center = TRUE)
  raw.mod <- pca(raw, method = "nipals", nPcs = 5)
  raw.sum <- as.data.frame(t(summary(raw.mod)))
  colnames(raw.sum) <- c("R2", "Cumulative_R2")
  raw.sum$PC <- 1:5
  # Merge scores onto the scaled data by rownames (by = 0), then attach the
  # class info so it can drive the aesthetics of the scores plots later.
  sl_raw <- merge(raw, raw.mod@scores, by = 0)
  class_info <- metadata %>%
    rename("Row.names" = Sample)
  new_sl_raw <- left_join(sl_raw, class_info, by = "Row.names")
  # Plain if/else with message(): the original used if_else() on a scalar
  # and discarded the result, so the check was never actually reported.
  if (anyNA(new_sl_raw)) {
    message("NAs are present in the PCA scores or metadata - double check your sample and treatment lists")
  } else {
    message("No NAs in PCA scores or metadata")
  }
  # Turn R2 values into percentages so they can be added to the axes
  raw.sum$R2percent <- signif(raw.sum$R2, digits = 3) * 100
  list(raw.sum, new_sl_raw)
}
# Run the initial PCA; the result feeds draw_scores_plot() below
results_for_PCA_graphs <- initial_PCA(MStype = "XCMS")
## Now you are ready to make a scores plot
# Draw a PCA scores plot from the output of initial_PCA().
#
# PC_x, PC_y: PC names as strings, e.g. "PC1", "PC2" (axis labels show their
#   R2 as a percentage, looked up from results_for_PCA_graphs[[1]]).
# colour_class, shape_class: metadata column names (strings) used for point
#   colour and point shape / ellipse linetype; they may be the same column.
# Returns the ggplot object (95% CI ellipses per class included).
draw_scores_plot <- function(PC_x, PC_y, colour_class, shape_class) {
  # all_of() makes the string -> column selection explicit (and errors
  # clearly if a name is wrong)
  data_for_graph <- results_for_PCA_graphs[[2]] %>%
    select(all_of(c(PC_x, PC_y, colour_class, shape_class)))
  # NOTE(review): the original computed an NA-filtered copy here and then
  # immediately overwrote it with the unfiltered data, so no rows were ever
  # dropped; that dead code has been removed. NA classes are still hidden
  # from the legends via na.translate = FALSE below.
  raw.sum <- results_for_PCA_graphs[[1]] %>%
    rownames_to_column()
  pca_theme <- theme_bw() +
    theme(axis.text.x = element_text(hjust = 0.5, vjust = 0.5, size = 12),
          axis.text.y = element_text(size = 12),
          axis.title.x = element_text(size = 12),
          axis.title.y = element_text(size = 12),
          legend.title = element_text(size = 12),
          legend.background = element_blank(),
          legend.key = element_blank(),
          plot.background = element_blank(),
          panel.grid.major = element_line(colour = NA),
          panel.grid.minor = element_line(colour = NA),
          panel.background = element_rect(fill = "white"),
          panel.border = element_blank(), axis.line = element_line())
  # .data[[ ]] is the tidy-eval way to map string column names to aesthetics
  # (clearer than the original get(names(...)) indirection)
  graph_raw <- ggplot(data_for_graph, aes(x = .data[[PC_x]], y = .data[[PC_y]])) +
    geom_point(aes(colour = .data[[colour_class]], shape = .data[[shape_class]]), size = 3) +
    stat_ellipse(aes(linetype = .data[[shape_class]], colour = .data[[colour_class]])) +
    scale_colour_discrete(colour_class, na.translate = FALSE) +
    scale_shape_discrete(shape_class, na.translate = FALSE) +
    scale_linetype(shape_class, na.translate = FALSE) +
    pca_theme +
    # Axis labels show the R2 of the specified PCs - make sure they match
    labs(x = paste0(PC_x, "(", raw.sum$R2percent[raw.sum$rowname == PC_x], "%)"),
         y = paste0(PC_y, "(", raw.sum$R2percent[raw.sum$rowname == PC_y], "%)")) +
    ggtitle(paste("PCA Scores plot")) +
    theme(legend.justification = c(0, 0), legend.position = c(0.9, 0.1),
          legend.key.size = unit(0.8, "cm"), legend.text = element_text(size = 12))
  graph_raw
}
# Use draw_scores_plot() to make various graphs (it automatically labels the
# axes with the variance associated with the PCs you choose).
# If you only have one class (treatment group) then you still need to define
# both colour_class and shape_class, but they can be the same.
scores_plot <- draw_scores_plot(PC_x = "PC1",
PC_y = "PC2",
colour_class = "Time",
shape_class = "Treat")
scores_plot
# Option to save this plot (here with manual colours for the "Time" levels --
# adjust the level names and colours to match YOUR metadata)
ggsave("Tidy_data/PCA_scores_plot.png", scores_plot + scale_colour_manual("Time", values=c("T1"="black", "T3"="tomato")))
#-------
# a PERMANOVA is another type of multivariate analysis and can help corroborate any differences in your PCA model
# adonis2 from the vegan package is used to run a PERMANOVA
# first we may have to do some missing value imputation - you can adjust this if you want to, but here we just add 1 to all intensities
# as it is very rare to have an intensity of 1
#if you don't have zeros you will not need to impute
# Simple missing-value "imputation": shift every intensity up by 1 so that
# zeros become 1 (a raw intensity of exactly 1 is very rare in MS data).
# `na.rm` is accepted so the signature matches what mutate_all() forwards,
# but it is not used.
impute <- function(x, na.rm = FALSE) {
  x + 1
}
# Intensity matrix without the Sample column; uncomment the pipe below to
# apply the +1 imputation if your data contain zeros
permanova_data <- peak_table %>%
select(-Sample) #%>%
#mutate_all(impute)
# Tell adonis2() which metadata column(s) you want in your formula.
# NOTE(review): "AMF" must be a column of YOUR metadata - replace it with the
# class column you want to test (e.g. the one used for tidy_for_muma below).
PERMANOVA <- adonis2(permanova_data ~ AMF, metadata, permutations=999)
write.csv(PERMANOVA, "Tidy_data/PERMANOVA_output.csv")
#-------
# If you have some clustering/ separation between your classes then you can use directed analysis to pull out the features (mz__rt or mz bin)
# that have the biggest, most reliable effect on the OPLS-DA model ...
# muma package will do an OPLS-DA
### the next bit gets the peak_table (the mz values for each sample)
# into the format that muma package wants
# so needs Samples (4-5 characters, unique) and then Class (integers)
# as the first 2 columns and then all the mz slices as variables
# useful for PCA and OPLSDA theory https://metabolomics.se/Courses/MVA/MVA%20in%20Omics_Handouts_Exercises_Solutions_Thu-Fri.pdf
# if you want to do OPLS-DA you have to run muma with the original
# data file recoded so that Class only contains "1" or "2"
# This function will filter the data by the two classes you're interested in, recode them the way muma wants, and recode your
# sample names if they are too long (>5 characters. Don't worry, it will save a file with a key to the new sample names if that's the case)
# Prepare the peak table for the muma package.
#
# muma wants a csv with columns: Sample (unique, <= 5 characters),
# Class (integer codes "1"/"2"), then all the m/z features.
#
# metadata_class: name (string) of the metadata column holding your classes.
# class_1, class_2: the two levels of that column to compare (recoded to
#   "1" and "2"); samples in any other level, or with a missing class, are
#   dropped.
# Side effects: writes muma_undirected_directed_analysis/data_for_muma.csv,
# plus recoded_sample_names.csv when sample names had to be shortened.
tidy_for_muma <- function(metadata_class, class_1, class_2) {
  temp_data <- peak_table %>%
    column_to_rownames(var = "Sample")
  # all_of() selects the column NAMED by metadata_class; the original
  # select(Sample, as.factor(metadata_class)) only worked by accident.
  temp <- metadata %>%
    select(Sample, all_of(metadata_class)) %>%
    mutate(old_class = as.factor(.data[[metadata_class]])) %>%
    # The original filter(!is.na(metadata_class)) tested the string literal,
    # which is never NA, so nothing was filtered; test the actual column.
    filter(!is.na(old_class))
  # Recode the two levels of interest to "1"/"2"; everything else to "NA"
  new_class <- tibble(old_class = levels(temp$old_class)) %>%
    mutate(Class = case_when(
      old_class == class_1 ~ "1",
      old_class == class_2 ~ "2",
      TRUE ~ "NA"
    ))
  class_info <- temp %>%
    left_join(new_class, by = "old_class") %>%
    select(Sample, Class) %>%
    mutate(unique_sample = paste0("S", row_number()))
  muma_data <- peak_table %>%
    left_join(class_info, by = "Sample") %>%
    filter(Class != "NA") %>%
    select(Sample, unique_sample, Class, any_of(colnames(temp_data))) %>%
    # muma needs short sample names: keep names of <= 5 characters,
    # otherwise substitute the generated S1, S2, ... codes
    mutate(Sample = if_else(str_length(Sample) <= 5, Sample, unique_sample))
  # Make sure the output folder exists before writing into it
  dir.create("muma_undirected_directed_analysis", showWarnings = FALSE)
  # Save the data frame for use in the next step
  write_csv(muma_data %>% select(-unique_sample),
            "muma_undirected_directed_analysis/data_for_muma.csv")
  # The original only measured the FIRST sample's name (via str_length() on
  # a 1x1 tibble); check every sample name instead.
  if (any(str_length(peak_table$Sample) >= 6)) {
    # Save a key mapping original sample names to the recoded ones
    write_csv(muma_data %>% select(Sample, unique_sample),
              "muma_undirected_directed_analysis/recoded_sample_names.csv")
    message("Sample names were changed, see muma_undirected_directed_analysis/recoded_sample_names.csv for the new sample names")
  } else {
    message("Sample names were not changed")
  }
  paste("The data frame for muma has been saved as muma_undirected_directed_analysis/data_for_muma.csv")
}
# Run the function for the class, and the two levels of that class, that you
# are interested in comparing:
tidy_for_muma(metadata_class = "Drought",
class_1 = "WW",
class_2 = "DS")
# Now the data is in the right format, let muma do all the hard work.
# For details/help see ?muma or https://rdrr.io/cran/muma/
# Here we use pareto scaling, but it is worth exploring whether that is the
# best scaling for your data.
# You will also need to adjust `imputation` and `imput` if you have MALDI
# data (likely to have a lot of missing values); imputation is disabled below,
# so "ImputType" is only a placeholder string.
explore.data(file="muma_undirected_directed_analysis/data_for_muma.csv", scaling="pareto",
scal=TRUE, normalize=TRUE, imputation=FALSE,
imput="ImputType")
# muma saves all the output of the analysis under "muma_undirected_directed_analysis/"
Plot.pca(pcx=1, pcy=2, scaling="pareto", test.outlier=TRUE)
# Repeat for any combination of PCs - muma will print the best combinations in R.
# You may get the following error, but it doesn't seem to cause a problem
# (I generally ignore it):
#   Error in dist1[, 1] : argument "i" is missing, with no default
# Now get muma to run the OPLS-DA - you must specify the same type of scaling
# as in the PCA above:
oplsda(scaling="pareto")
# Rebuild the muma OPLS-DA S-plot as a ggplot and save a table of the
# discriminating variables (features with the largest, most reliable effect
# on the OPLS-DA model).
#
# pcorr1_min: minimum (positive) pcorr1 kept at the top-of-S end.
# pcorr1_max: maximum (negative) pcorr1 kept at the bottom-of-S end.
# p_top_n: how many extreme-p1 loadings to consider for each class (you may
#   get fewer back after the pcorr1 reliability filter).
# Side effects: writes OPLS-DApareto/OPLSDA_discriminating_variables.csv and
# OPLS-DApareto/ggplot_OPLSDA_S-plot.png.
nicer_graphs_and_discriminating_variables <- function(pcorr1_min, pcorr1_max, p_top_n) {
  # p1 (loading/covariance) and pcorr1 (correlation/~reliability) from muma
  oplsda_pcorr1 <- read.csv("OPLS-DApareto/pcorr1_Matrix.csv") %>%
    dplyr::rename(pcorr1 = V1)
  oplsda_p1_Matrix <- read.csv("OPLS-DApareto/p1_Matrix.csv") %>%
    dplyr::rename(p1 = V1)
  loadings_column <- left_join(oplsda_p1_Matrix, oplsda_pcorr1, by = "X")
  loadings_matrix <- read.csv("OPLS-DApareto/PCA_OPLS/PCA_OPLS_LoadingsMatrix.csv")
  variables <- loadings_matrix %>%
    pull(X)
  loadings_column <- loadings_column %>%
    add_column(variables) %>%
    select(-X) %>%
    # Strip the leading "X" that R prepends to numeric column names.
    # NOTE(review): assumes the m/z (or mz__rt) label fits in characters
    # 2-9 -- widen str_sub() if your feature names are longer.
    mutate(mz = str_sub(variables, 2, 9))
  # Features associated with each class: the p_top_n most extreme p1 values,
  # filtered by reliability. Look at your S plot to choose the cutoffs and to
  # see which loadings associate with which class.
  top <- loadings_column %>%
    arrange(desc(p1)) %>%
    slice(1:p_top_n) %>%
    filter(pcorr1 > pcorr1_min) # change this depending on your S plot
  bottom <- loadings_column %>%
    arrange(p1) %>%
    slice(1:p_top_n) %>%
    filter(pcorr1 < pcorr1_max) # change this depending on your S plot
  # Table of variables (mz or mz__rt) of interest to get putative ids for;
  # association is "1" (top end) or "2" (bottom end)
  discriminating_variables <- bind_rows(top, bottom, .id = "id") %>%
    dplyr::rename(association = id)
  write.csv(discriminating_variables, "OPLS-DApareto/OPLSDA_discriminating_variables.csv")
  pca_theme <- theme_bw() +
    theme(axis.text.x = element_text(hjust = 0.5, vjust = 0.5, size = 12),
          axis.text.y = element_text(size = 12),
          axis.title.x = element_text(size = 12),
          axis.title.y = element_text(size = 12),
          legend.title = element_text(size = 12),
          legend.background = element_blank(),
          legend.key = element_blank(),
          plot.background = element_blank(),
          panel.grid.major = element_line(colour = NA),
          panel.grid.minor = element_line(colour = NA),
          panel.background = element_rect(fill = "white"),
          panel.border = element_blank(), axis.line = element_line())
  # x range padded by 20%; y is pcorr1, which lives in [-1, 1]
  p_x_limits <- c(min(loadings_column$p1) * 1.2, max(loadings_column$p1) * 1.2)
  p_top_n_limits <- min(top$p1)
  p_bottom_n_limits <- max(bottom$p1)
  # NOTE(review): the highlight rectangles are specified inside aes(), so one
  # translucent rect is over-plotted per data row; the tiny alpha relies on
  # that stacking, so it is kept as-is to preserve the rendered output.
  new_S_plot <- ggplot(loadings_column, aes(x = p1, y = pcorr1)) +
    geom_point() +
    pca_theme +
    scale_x_continuous(limits = p_x_limits, expand = c(0, 0)) +
    scale_y_continuous(limits = c(-1, 1.1), expand = c(0, 0)) +
    #geom_text(aes(label=mz, x=p1, y=(jitter((pcorr1 + 0.05), factor=1, amount=NULL)))) +
    geom_rect(aes(xmin = p_top_n_limits, xmax = p_x_limits[2],
                  ymin = pcorr1_min, ymax = 1), fill = "tomato", alpha = 0.005) +
    geom_rect(aes(xmin = p_x_limits[1], xmax = p_bottom_n_limits,
                  ymin = pcorr1_max, ymax = -1), fill = "tomato", alpha = 0.005)
  # Name the device argument explicitly (it was passed positionally before)
  ggsave("OPLS-DApareto/ggplot_OPLSDA_S-plot.png", new_S_plot, device = "png")
  paste("S plot and discriminating variables table saved to OPLS-DApareto/ggplot_OPLSDA_S-plot.png and OPLS-DApareto/OPLSDA_discriminating_variables.csv")
}
# Here you need to tell the function the following:
# p_top_n -> how many variables you want the function to look at associated
#   with each class (you may get fewer returned if they had low reliability
#   in the model, but it will start with the number you give it)
# pcorr1_min -> the minimum positive value of ~reliability in the model kept
#   at the top end of the list of discriminating variables
#   (i.e. how strict do you want to be; look at pcorr1 on the S plot)
# pcorr1_max -> the maximum negative value of ~reliability kept at the bottom
#   end of the list of discriminating variables
# These values will be slightly different in every analysis - you really do
# need to look at the data and think about what you want to know.
nicer_graphs_and_discriminating_variables(pcorr1_min = 0.4,
pcorr1_max = -0.4,
p_top_n = 20
)