diff --git a/DIMS/AddOnFunctions/10-collectSamplesFilled_extraoutput.R b/DIMS/AddOnFunctions/10-collectSamplesFilled_extraoutput.R new file mode 100644 index 0000000..21ccdce --- /dev/null +++ b/DIMS/AddOnFunctions/10-collectSamplesFilled_extraoutput.R @@ -0,0 +1,94 @@ +#!/usr/bin/Rscript + +.libPaths(new = "/hpc/local/CentOS7/dbg_mz/R_libs/3.2.2") + +# load required packages +# none + +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) +for (arg in cmd_args) cat(" ", arg, "\n") + +outdir <- cmd_args[1] +scanmode <- cmd_args[2] +normalization <- cmd_args[3] +scripts <- cmd_args[4] +z_score <- as.numeric(cmd_args[5]) +ppm <- as.numeric(cmd_args[6]) + +#outdir <- "/Users/nunen/Documents/Metab/processed/test_old" +#scanmode <- "negative" +#normalization <- "disabled" +#scripts <- "/Users/nunen/Documents/Metab/DIMS/scripts" +#db <- "/Users/nunen/Documents/Metab/DIMS/db/HMDB_add_iso_corrNaCl_withIS_withC5OH.RData" +#z_score <- 0 + +object.files = list.files(paste(outdir, "9-samplePeaksFilled", sep="/"), full.names=TRUE, pattern=scanmode) +outlist.tot=NULL +for (i in 1:length(object.files)) { + load(object.files[i]) + print(print(object.files[i])) + outlist.tot = rbind(outlist.tot, final.outlist.idpat3) +} + +source(paste(scripts, "AddOnFunctions/sourceDir.R", sep="/")) +sourceDir(paste(scripts, "AddOnFunctions", sep="/")) + +# remove duplicates +outlist.tot = mergeDuplicatedRows(outlist.tot) + +# sort on mass +outlist.tot = outlist.tot[order(outlist.tot[,"mzmed.pgrp"]),] + +# normalization +load(paste0(outdir, "/repl.pattern.",scanmode,".RData")) + +if (normalization != "disabled") { + outlist.tot = normalization_2.1(outlist.tot, fileName, names(repl.pattern.filtered), on=normalization, assi_label="assi_HMDB") +} + +if (z_score == 1) { + outlist.stats = statistics_z(outlist.tot, sortCol=NULL, adducts=FALSE) + nr.removed.samples=length(which(repl.pattern.filtered[]=="character(0)")) + order.index.int=order(colnames(outlist.stats)[8:(length(repl.pattern.filtered)-nr.removed.samples+7)]) + outlist.stats.more = cbind(outlist.stats[,1:7], + outlist.stats[,(length(repl.pattern.filtered)-nr.removed.samples+8):(length(repl.pattern.filtered)-nr.removed.samples+8+6)], + outlist.stats[,8:(length(repl.pattern.filtered)-nr.removed.samples+7)][order.index.int], + outlist.stats[,(length(repl.pattern.filtered)-nr.removed.samples+5+10):ncol(outlist.stats)]) + + tmp.index=grep("_Zscore", colnames(outlist.stats.more), fixed = TRUE) + tmp.index.order=order(colnames(outlist.stats.more[,tmp.index])) + tmp = outlist.stats.more[,tmp.index[tmp.index.order]] + outlist.stats.more=outlist.stats.more[,-tmp.index] + outlist.stats.more=cbind(outlist.stats.more,tmp) + outlist.tot = outlist.stats.more +} + +# filter identified compounds +index.1=which((outlist.tot[,"assi_HMDB"]!="") & (!is.na(outlist.tot[,"assi_HMDB"]))) +index.2=which((outlist.tot[,"iso_HMDB"]!="") & (!is.na(outlist.tot[,"iso_HMDB"]))) +index=union(index.1,index.2) +outlist.ident = outlist.tot[index,] +outlist.not.ident = outlist.tot[-index,] + +if (z_score == 1) { + outlist.ident$ppmdev=as.numeric(outlist.ident$ppmdev) + outlist.ident <- outlist.ident[which(outlist.ident["ppmdev"] >= -ppm & outlist.ident["ppmdev"] <= ppm),] +} +# NAs in theormz_noise <======================================================================= uitzoeken!!! +outlist.ident$theormz_noise[which(is.na(outlist.ident$theormz_noise))] = 0 +outlist.ident$theormz_noise=as.numeric(outlist.ident$theormz_noise) +outlist.ident$theormz_noise[which(is.na(outlist.ident$theormz_noise))] = 0 +outlist.ident$theormz_noise=as.numeric(outlist.ident$theormz_noise) + +save(outlist.not.ident, outlist.ident, file=paste(outdir, "/outlist_identified_", scanmode, ".RData", sep="")) + +# Extra output in Excel-readable format: +remove_columns <- c("fq.best", "fq.worst", "mzmin.pgrp", "mzmax.pgrp") +remove_colindex <- which(colnames(outlist.ident) %in% remove_columns) +outlist.ident <- outlist.ident[ , -remove_colindex] +write.table(outlist.ident, file=paste0(outdir, "/outlist_identified_", scanmode, ".txt"), sep="\t", row.names = FALSE) +remove_colindex <- which(colnames(outlist.not.ident) %in% remove_columns) +outlist.not.ident <- outlist.not.ident[ , -remove_colindex] +write.table(outlist.not.ident, file=paste0(outdir, "/outlist_not_identified_", scanmode, ".txt"), sep="\t", row.names = FALSE) + diff --git a/DIMS/AddOnFunctions/add_lab_id_and_onderzoeksnummer.R b/DIMS/AddOnFunctions/add_lab_id_and_onderzoeksnummer.R new file mode 100644 index 0000000..9d91a55 --- /dev/null +++ b/DIMS/AddOnFunctions/add_lab_id_and_onderzoeksnummer.R @@ -0,0 +1,10 @@ +add_lab_id_and_onderzoeksnummer <- function(df_metabs_helix) { + # Split patient number into labnummer and Onderzoeksnummer + for (row in 1:nrow(df_metabs_helix)) { + df_metabs_helix[row,"labnummer"] <- gsub("^P|\\.[0-9]*", "", df_metabs_helix[row,"Patient"]) + labnummer_split <- strsplit(as.character(df_metabs_helix[row, "labnummer"]), "M")[[1]] + df_metabs_helix[row, "Onderzoeksnummer"] <- paste0("MB", labnummer_split[1], "/", labnummer_split[2]) + } + + return(df_metabs_helix) +} diff --git a/DIMS/AddOnFunctions/checkOverlap.R b/DIMS/AddOnFunctions/checkOverlap.R new file mode 100644 index 0000000..f7ef818 --- /dev/null +++ b/DIMS/AddOnFunctions/checkOverlap.R @@ -0,0 +1,16 @@ +checkOverlap <- function(range1,range2){ + if (length(intersect(range1,range2))==2) { + # Overlap + # message("Overlap, smaller range is used") + if (length(range1) >= length(range2)){ + range1=range1[-length(range1)] + } else { + range2=range2[-1] + } + } else if (length(intersect(range1,range2))==3){ + # message("Overlap, smaller range is used") + range1=range1[-length(range1)] + range2=range2[-1] + } + return(list("range1"=range1,"range2"=range2)) +} diff --git a/DIMS/AddOnFunctions/check_same_samplename.R b/DIMS/AddOnFunctions/check_same_samplename.R new file mode 100644 index 0000000..16ae9cf --- /dev/null +++ b/DIMS/AddOnFunctions/check_same_samplename.R @@ -0,0 +1,4 @@ +# function to test whether intensity and Z-score columns match +check_same_samplename <- function(int_col_name, zscore_col_name) { + paste0(int_col_name, "_Zscore") == zscore_col_name +} diff --git a/DIMS/AddOnFunctions/create_violin_plots.R b/DIMS/AddOnFunctions/create_violin_plots.R new file mode 100644 index 0000000..df398be --- /dev/null +++ b/DIMS/AddOnFunctions/create_violin_plots.R @@ -0,0 +1,116 @@ +create_violin_plots <- function(pdf_dir, pt_name, metab_perpage, top_metab_pt=NULL) { + + # set parameters for plots + plot_height <- 9.6 + plot_width <- 6 + fontsize <- 1 + nr_plots_perpage <- 20 + circlesize <- 0.8 + colors_4plot <- c("#22E4AC", "#00B0F0", "#504FFF","#A704FD","#F36265","#DA0641") + # green blue blue/purple purple orange red + + # patient plots, create the PDF device + pt_name_sub <- pt_name + suffix <- "" + if (grepl("Diagnostics", pdf_dir) & is_diagnostic_patient(pt_name)) { + prefix <- "MB" + suffix <- "_DIMS_PL_DIAG" + # substitute P and M in P2020M00001 into right format for Helix + pt_name_sub <- gsub("[PM]", "", pt_name) + pt_name_sub <- gsub("\\..*", "", pt_name_sub) + } else if (grepl("Diagnostics", pdf_dir)) { + prefix <- "Dx_" + } else if (grepl("IEM", pdf_dir)) { + prefix <- "IEM_" + } else { + prefix <- "R_" + } + + pdf(paste0(pdf_dir, "/", prefix, pt_name_sub, suffix, ".pdf"), + onefile = TRUE, + width = plot_width, + height = plot_height) + + # page headers: + page_headers <- names(metab_perpage) + + # put table into PDF file, if not empty + if (!is.null(dim(top_metab_pt))) { + plot.new() + # get the names and numbers in the table aligned + table_theme <- ttheme_default(core = list(fg_params = list(hjust=0, x=0.05, fontsize=6)), + colhead = list(fg_params = list(fontsize=8, fontface="bold"))) + grid.table(top_metab_pt, theme = table_theme, rows = NULL) + # g <- tableGrob(top_metab_pt) + # grid.draw(g) + text(x=0.45, y=1.02, paste0("Top deviating metabolites for patient: ", pt_name), font=1, cex=1) + } + + # violin plots + for (page_index in 1:length(metab_perpage)) { + # extract list of metabolites to plot on a page + metab_list_2plot <- metab_perpage[[page_index]] + # extract original data for patient of interest (pt_name) before cut-offs + pt_list_2plot_orig <- metab_list_2plot[which(metab_list_2plot$variable == pt_name), ] + # cut off Z-scores higher than 20 or lower than -5 (for nicer plots) + metab_list_2plot$value[metab_list_2plot$value > 20] <- 20 + metab_list_2plot$value[metab_list_2plot$value < -5] <- -5 + # extract data for patient of interest (pt_name) + pt_list_2plot <- metab_list_2plot[which(metab_list_2plot$variable == pt_name), ] + # restore original Z-score before cut-off, for showing Z-scores in PDF + pt_list_2plot$value_orig <- pt_list_2plot_orig$value + # remove patient of interest (pt_name) from list; violins will be made up of controls and other patients + metab_list_2plot <- metab_list_2plot[-which(metab_list_2plot$variable == pt_name), ] + # subtitle per page + sub_perpage <- gsub("_", " ", page_headers[page_index]) + # for IEM plots, put subtitle on two lines + sub_perpage <- gsub("probability", "\nprobability", sub_perpage) + # add size parameter for showing Z-score of patient per metabolite + Z_size <- rep(3, nrow(pt_list_2plot)) + # set size to 0 if row is empty + Z_size[is.na(pt_list_2plot$value)] <- 0 + + # draw violin plot. shape=22 gives square for patient of interest + ggplot_object <- ggplot(metab_list_2plot, aes(x=value, y=HMDB_name)) + + theme(axis.text.y=element_text(size=rel(fontsize)), plot.caption = element_text(size=rel(fontsize))) + + xlim(-5, 20) + + geom_violin(scale="width") + + geom_point(data = pt_list_2plot, aes(color=value), size = 3.5*circlesize, shape=22, fill="white") + + scale_fill_gradientn(colors = colors_4plot, values = NULL, space = "Lab", na.value = "grey50", guide = "colourbar", aesthetics = "colour") + + # add Z-score value for patient of interest at x=16 + geom_text(data = pt_list_2plot, aes(16, label = paste0("Z=", round(value_orig, 2))), hjust = "left", vjust = +0.2, size = Z_size) + + # add labels. Use font Courier to get all the plots in the same location. + labs(x = "Z-scores", y = "Metabolites", subtitle = sub_perpage, color = "z-score") + + theme(axis.text.y = element_text(family = "Courier", size=6)) + + # do not show legend + theme(legend.position="none") + + # add title + ggtitle(label = paste0("Results for patient ", pt_name)) + + # labs(x = "Z-scores", y = "Metabolites", title = paste0("Results for patient ", pt_name), subtitle = sub_perpage, color = "z-score") + + # add vertical lines + geom_vline(xintercept = 2, col = "grey", lwd = 0.5, lty=2) + + geom_vline(xintercept = -2, col = "grey", lwd = 0.5, lty=2) + + suppressWarnings(print(ggplot_object)) + + } + + # add explanation of violin plots, version number etc. + # plot.new() + plot(NA, xlim=c(0,5), ylim=c(0,5), bty='n', xaxt='n', yaxt='n', xlab='', ylab='') + if (length(explanation) > 0) { + text(0.2, 5, explanation[1], pos=4, cex=0.8) + for (line_index in 2:length(explanation)) { + text_y_position <- 5 - (line_index*0.2) + text(-0.2, text_y_position, explanation[line_index], pos=4, cex=0.5) + } + # full_explanation <- paste(explanation[2:length(explanation)], sep=" \n") + # text(0.2, 4, full_explanation, pos=4, cex=0.6) + #explanation_grob=textGrob(apply(full_explanation, 2, paste, collapse="\n")) + #grid.arrange(explanation_grob) + } + + # close the PDF file + dev.off() + +} diff --git a/DIMS/AddOnFunctions/elementInfo.R b/DIMS/AddOnFunctions/elementInfo.R new file mode 100644 index 0000000..18eaeff --- /dev/null +++ b/DIMS/AddOnFunctions/elementInfo.R @@ -0,0 +1,6 @@ +elementInfo <- function(name, elements = NULL) { # from Rdisop function .getElement + if (!is.list(elements) || length(elements)==0 ) { + elements <- initializePSE() } + if (name=="CH3OH+H"){rex<-"^CH3OH\\+H$"}else{rex <- paste ("^",name,"$", sep="")} + elements [[grep (rex, sapply (elements, function(x) {x$name}))]] +} \ No newline at end of file diff --git a/DIMS/AddOnFunctions/export.R b/DIMS/AddOnFunctions/export.R new file mode 100644 index 0000000..cab79a0 --- /dev/null +++ b/DIMS/AddOnFunctions/export.R @@ -0,0 +1,13 @@ +export <- function(peaklist, plotdir, adducts, control_label, case_label, patients, sub, fileName){ + # peaklist = outlist.adducts + # adducts=TRUE + # control_label="C" + # case_label="P" + # patients = getPatients(outlist.adducts) + # sub=3000 + + # peaklist = statistics_z_4export(as.data.frame(peaklist), plotdir, patients, adducts, control_label, case_label) + + # generateExcelFile(peaklist, file.path(plotdir), imageNum=2, fileName, subName=c("","_box"), sub, adducts) + +} diff --git a/DIMS/AddOnFunctions/findPeaks.Gauss.HPC.R b/DIMS/AddOnFunctions/findPeaks.Gauss.HPC.R new file mode 100644 index 0000000..c00aa92 --- /dev/null +++ b/DIMS/AddOnFunctions/findPeaks.Gauss.HPC.R @@ -0,0 +1,24 @@ +### fit Gaussian estimate mean and integrate to obtain intensity +findPeaks.Gauss.HPC <- function(plist, breaks.fwhm, int.factor, scale, resol, outdir, scanmode, plot, thresh, width, height) { + sampname <- colnames(plist)[1] + + range <- as.vector(plist) + names(range) <- rownames(plist) + + values <- list("mean"=NULL, "area"=NULL, "nr"=NULL, "min"=NULL, "max"=NULL, "qual"=NULL, "spikes"=0) + + values <- searchMZRange(range, values, int.factor, scale, resol, outdir, sampname, scanmode, plot, width, height, thresh) + + outlist.persample <- NULL + outlist.persample <- cbind("samplenr"=values$nr, "mzmed.pkt"=values$mean, "fq"=values$qual, "mzmin.pkt"=values$min, "mzmax.pkt"=values$max, "height.pkt"=values$area) + + index <- which(outlist.persample[ ,"height.pkt"]==0) + if (length(index) > 0) { + outlist.persample <- outlist.persample[-index,] + } + + # save(outlist.persample, file=paste(outdir, paste(sampname, "_", scanmode, ".RData", sep=""), sep="/")) + save(outlist.persample, file=paste("./", sampname, "_", scanmode, ".RData", sep="")) + + cat(paste("There were", values$spikes, "spikes!")) +} diff --git a/DIMS/AddOnFunctions/fit1Peak.R b/DIMS/AddOnFunctions/fit1Peak.R new file mode 100644 index 0000000..b09ce20 --- /dev/null +++ b/DIMS/AddOnFunctions/fit1Peak.R @@ -0,0 +1,120 @@ +fit1Peak <- function(x2,x,y,index,scale,resol,plot,FQ,useBounds) { + #FQ=FQ1 + + if (length(y)<3){ + message("Range to small, no fit possible!") + } else { + + if ((length(y)==4)) { + mu = weighted.mean(x,y) + sigma = getSD(x,y) + fitP = fitG_2(x,y,sigma,mu,scale,useBounds) + } else { + + if ((length(x) - length(index)) < 2) { + range1=c((length(x)-4):length(x)) + } else if (length(index) < 2) { + range1=c(1:5) + } else { + range1=c(index[1]-2,index[1]-1,index[1],index[1]+1,index[1]+2) + } + + if (range1[1]==0) range1=range1[-1] + + # remove NA + if (length(which(is.na(y[range1])))!=0) range1=range1[-which(is.na(y[range1]))] + + mu = weighted.mean(x[range1],y[range1]) + sigma = getSD(x[range1],y[range1]) + fitP = fitG_2(x,y,sigma,mu,scale,useBounds) + } + + p2 = fitP$par + + #fq_new = abs(sum(y) - sum(p2[2]*dnorm(x,p2[1],sigma)))/sum(y) + fq_new = getFitQuality(x,y,p2[1],p2[1],resol,p2[2],sigma)$fq_new + + if (plot & (fq_new < FQ)) lines(x2,p2[2]*dnorm(x2,p2[1],sigma), col="green") + + scale_new = 1.2*scale + # cat(fq_new) + + if (fq_new > FQ) { # <=== bad fit? + # optimize scaling factor + fq = 0 + scale = 0 + + if (sum(y)>sum(p2[2]*dnorm(x,p2[1],sigma))){ + while ((round(fq, digits = 3) != round(fq_new, digits = 3)) & (scale_new<10000)) { + fq = fq_new + scale = scale_new + + #cat(scale) + fitP = fitG_2(x,y,sigma,mu,scale,useBounds) + p2 = fitP$par + + #fq_new = abs(sum(y) - sum(p2[2]*dnorm(x,p2[1],sigma)))/sum(y) + fq_new = getFitQuality(x,y,p2[1],p2[1],resol,p2[2],sigma)$fq_new + scale_new=1.2*scale + + if (plot & (fq_new < FQ)) lines(x2,p2[2]*dnorm(x2,p2[1],sigma), col="green") + # cat(paste("fq_new: ", fq_new)) + # cat(paste("scale_new: ", scale_new)) + # (round(fq, digits = 4) != round(fq_new, digits = 4)) + } + } else { + while ((round(fq, digits = 3) != round(fq_new, digits = 3)) & (scale_new<10000)) { + fq = fq_new + scale = scale_new + + # cat(scale) + fitP = fitG_2(x,y,sigma,mu,scale,useBounds) + p2 = fitP$par + + #fq_new = abs(sum(y) - sum(p2[2]*dnorm(x,p2[1],sigma)))/sum(y) + fq_new = getFitQuality(x,y,p2[1],p2[1],resol,p2[2],sigma)$fq_new + scale_new=0.8*scale + + if (plot & (fq_new < FQ)) lines(x2,p2[2]*dnorm(x2,p2[1],sigma), col="green") + # cat(paste("fq_new: ", fq_new)) + } + } + + if (fq < fq_new) { + # cat(paste("fq_new: ", fq_new)) + # cat(paste("fq: ", fq)) + # cat(paste("scale_new: ", scale_new)) + # cat(paste("scale: ", scale)) + + fitP = fitG_2(x,y,sigma,mu,scale,useBounds) + p2 = fitP$par + fq_new = fq + # cat(paste("==> fq_new: ", fq_new)) + if (plot & (fq_new < FQ)) lines(x2,p2[2]*dnorm(x2,p2[1],sigma), col="dark green") + + } + } + + if (plot & (fq_new < FQ)) { + # plot ################### + #lines(x2,p2[2]*dnorm(x2,p2[1],sigma), col="green") + fwhm = getFwhm(p2[1],resol) + half_max = max(p2[2]*dnorm(x2,p2[1],sigma))*0.5 + lines(c(p2[1] - 0.5*fwhm, p2[1] + 0.5*fwhm),c(half_max,half_max),col="orange") + abline(v = p2[1], col="green") + h=c(paste("mean =", p2[1], sep=" "), + paste("fq =", fq_new, sep=" ")) + legend("topright", legend=h) + ########################## + + + # abline(v = x[6], col="red") + # fwhm = getFwhm(x[6]) + # abline(v =x[6] + 0.6*fwhm, col="red") + # abline(v =x[6] - 0.6*fwhm, col="red") + + } + } + + return(list("mean"=p2[1], "scale"=p2[2], "sigma"=sigma, "qual"=fq_new)) +} diff --git a/DIMS/AddOnFunctions/fit2G.R b/DIMS/AddOnFunctions/fit2G.R new file mode 100644 index 0000000..9dc76dd --- /dev/null +++ b/DIMS/AddOnFunctions/fit2G.R @@ -0,0 +1,43 @@ +fit2G_2 <- function(x,y,sig1,sig2,mu1,scale1,mu2,scale2,useBounds){ + + f = function(p){ + d = p[2]*dnorm(x,mean=p[1],sd=sig1) + p[4]*dnorm(x,mean=p[3],sd=sig2) + sum((d-y)^2) + } + + if (useBounds){ + lower = c(x[1],0,x[1],0) + upper = c(x[length(x)],Inf,x[length(x)],Inf) + + if (is.null(mu2) && is.null(scale2) && is.null(sig2)){ + sig2=sig1 + optim(c(as.numeric(mu1), + as.numeric(scale1), + as.numeric(mu1), + as.numeric(scale1)), + f,control=list(maxit=10000),method="L-BFGS-B",lower=lower,upper=upper) + } else { + optim(c(as.numeric(mu1), + as.numeric(scale1), + as.numeric(mu2), + as.numeric(scale2)), + f,control=list(maxit=10000),method="L-BFGS-B",lower=lower,upper=upper) + } + + } else { + if (is.null(mu2) && is.null(scale2) && is.null(sig2)){ + sig2=sig1 + optim(c(as.numeric(mu1), + as.numeric(scale1), + as.numeric(mu1), + as.numeric(scale1)), + f,control=list(maxit=10000)) + } else{ + optim(c(as.numeric(mu1), + as.numeric(scale1), + as.numeric(mu2), + as.numeric(scale2)), + f,control=list(maxit=10000)) + } + } +} diff --git a/DIMS/AddOnFunctions/fit2peaks.R b/DIMS/AddOnFunctions/fit2peaks.R new file mode 100644 index 0000000..4d9c64d --- /dev/null +++ b/DIMS/AddOnFunctions/fit2peaks.R @@ -0,0 +1,100 @@ +fit2peaks <- function(x2,x,y,index,scale,resol,useBounds=FALSE,plot=FALSE,FQ,int.factor){ + + peak.mean = NULL + peak.area = NULL + peak.scale = NULL + peak.sigma = NULL + + range1=c(index[1]-2,index[1]-1,index[1],index[1]+1,index[1]+2) + if (range1[1]==0) range1=range1[-1] + + range2=c(index[2]-2,index[2]-1,index[2],index[2]+1,index[2]+2) + + if (length(x)0) range1=range1[-remove] + remove=which(range2<1) + if (length(remove)>0) range2=range2[-remove] + + # remove NA + if (length(which(is.na(y[range1])))!=0) range1=range1[-which(is.na(y[range1]))] + if (length(which(is.na(y[range2])))!=0) range2=range2[-which(is.na(y[range2]))] + + mu1 = weighted.mean(x[range1],y[range1]) + sigma1 = getSD(x[range1],y[range1]) + + # message(paste("fit2peaks mu =>", mu1)) + # message(paste("fit2peaks sigma =>", sigma1)) + # message(paste("fit2peaks scale =>", scale)) + + fitP = fitG_2(x[range1],y[range1],sigma1,mu1,scale,useBounds) + p = fitP$par + + mu2 = weighted.mean(x[range2],y[range2]) + sigma2 = getSD(x[range2],y[range2]) + fitP = fitG_2(x[range2],y[range2],sigma2,mu2,scale,useBounds) + p2 = fitP$par + + fit2P = fit2G_2(x, y, sigma1, sigma2, p[1], p[2], p2[1], p2[2],useBounds) + p3 = fit2P$par + + if (is.null(sigma2)) sigma2=sigma1 + + + # plot ################### + sumFit2 = (p3[2]*dnorm(x2,p3[1],sigma1))+(p3[4]*dnorm(x2,p3[3],sigma2)) + sumFit = (p3[2]*dnorm(x,p3[1],sigma1))+(p3[4]*dnorm(x,p3[3],sigma2)) + fq=getFitQuality(x,y,sort(c(p3[1],p3[3]))[1],sort(c(p3[1],p3[3]))[2],resol,sumFit=sumFit)$fq_new + + fwhm = getFwhm(p3[1],resol) + half_max = max(p3[2]*dnorm(x2,p3[1],sigma1))*0.5 + if (plot & (fq < FQ)) lines(c(p3[1] - 0.5*fwhm, p3[1] + 0.5*fwhm),c(half_max,half_max),col="orange") + if (plot & (fq < FQ)) lines(x2,p3[4]*dnorm(x2,p3[3],sigma2),col="grey") + if (plot & (fq < FQ)) abline(v = p3[3], col="grey") + + fwhm = getFwhm(p3[3],resol) + half_max = max(p3[4]*dnorm(x2,p3[3],sigma2))*0.5 + if (plot & (fq < FQ)) lines(c(p3[3] - 0.5*fwhm, p3[3] + 0.5*fwhm),c(half_max,half_max),col="orange") + + if (plot & (fq < FQ)) lines(x2,sumFit2,col="black") + + if (plot & (fq < FQ)) lines(x2,p3[2]*dnorm(x2,p3[1],sigma1),col="grey") + if (plot & (fq < FQ)) abline(v = p3[1], col="grey") + + + h2=c(paste("mean =", p3[1], sep=" "), + paste("mean =", p3[3], sep=" "), + paste("fq =", fq, sep=" ")) + + if (plot & (fq < FQ)) legend("topright", legend=h2) + ########################## + + # lines(x2,p3[4]*dnorm(x2,p3[3],sigma2),col="red") + # area1 = sum(p3[2]*dnorm(x2,p3[1],sigma1)) + # area2 = sum(p3[4]*dnorm(x2,p3[3],sigma2)) + + # area1 = max(p3[2]*dnorm(x2,p3[1],sigma1)) + # area2 = max(p3[4]*dnorm(x2,p3[3],sigma2)) + + area1 = getArea(p3[1],resol,p3[2],sigma1,int.factor) + area2 = getArea(p3[3],resol,p3[4],sigma2,int.factor) + + peak.area = c(peak.area, area1) + peak.area = c(peak.area, area2) + + peak.mean = c(peak.mean, p3[1]) + peak.mean = c(peak.mean, p3[3]) + + peak.scale = c(peak.scale, p3[2]) + peak.scale = c(peak.scale, p3[4]) + + peak.sigma = c(peak.sigma, sigma1) + peak.sigma = c(peak.sigma, sigma2) + + return(list("mean"=peak.mean, "scale"=peak.scale, "sigma"=peak.sigma, "area"=peak.area, "qual"=fq)) +} \ No newline at end of file diff --git a/DIMS/AddOnFunctions/fit3G.R b/DIMS/AddOnFunctions/fit3G.R new file mode 100644 index 0000000..05543c5 --- /dev/null +++ b/DIMS/AddOnFunctions/fit3G.R @@ -0,0 +1,18 @@ +fit3G_2 <- function(x,y,sig1,sig2,sig3,mu1,scale1,mu2,scale2,mu3,scale3,useBounds){ + + f = function(p){ + d = p[2]*dnorm(x,mean=p[1],sd=sig1) + p[4]*dnorm(x,mean=p[3],sd=sig2) + p[6]*dnorm(x,mean=p[5],sd=sig3) + sum((d-y)^2) + } + + if (useBounds){ + lower = c(x[1],0,x[1],0,x[1],0) + upper = c(x[length(x)],Inf,x[length(x)],Inf,x[length(x)],Inf) + + optim(c(mu1,scale1,mu2,scale2,mu3,scale3),f,control=list(maxit=10000),method="L-BFGS-B",lower=lower,upper=upper) + + } else { + optim(c(mu1,scale1,mu2,scale2,mu3,scale3),f,control=list(maxit=10000)) + } + +} diff --git a/DIMS/AddOnFunctions/fit3peaks.R b/DIMS/AddOnFunctions/fit3peaks.R new file mode 100644 index 0000000..e373f14 --- /dev/null +++ b/DIMS/AddOnFunctions/fit3peaks.R @@ -0,0 +1,122 @@ +fit3peaks <- function(x2,x,y,index,scale,resol,useBounds=FALSE,plot=FALSE,FQ,int.factor){ + + peak.mean = NULL + peak.area = NULL + peak.scale = NULL + peak.sigma = NULL + + range1=c(index[1]-2,index[1]-1,index[1],index[1]+1,index[1]+2) + range2=c(index[2]-2,index[2]-1,index[2],index[2]+1,index[2]+2) + range3=c(index[3]-2,index[3]-1,index[3],index[3]+1,index[3]+2) + + remove=which(range1<1) + if (length(remove)>0) { + range1=range1[-remove] + } + remove=which(range2<1) + if (length(remove)>0) { + range2=range2[-remove] + } + + if (length(x)0) range1=range1[-remove] + remove=which(range2<1) + if (length(remove)>0) range2=range2[-remove] + remove=which(range3<1) + if (length(remove)>0) range3=range3[-remove] + + # remove NA + if (length(which(is.na(y[range1])))!=0) range1=range1[-which(is.na(y[range1]))] + if (length(which(is.na(y[range2])))!=0) range2=range2[-which(is.na(y[range2]))] + if (length(which(is.na(y[range3])))!=0) range3=range3[-which(is.na(y[range3]))] + + mu1 = weighted.mean(x[range1],y[range1]) + sigma1 = getSD(x[range1],y[range1]) + fitP = fitG_2(x[range1],y[range1],sigma1,mu1,scale,useBounds) + p = fitP$par + + mu2 = weighted.mean(x[range2],y[range2]) + sigma2 = getSD(x[range2],y[range2]) + fitP = fitG_2(x[range2],y[range2],sigma2,mu2,scale,useBounds) + p2 = fitP$par + + mu3 = weighted.mean(x[range3],y[range3]) + sigma3 = getSD(x[range3],y[range3]) + fitP = fitG_2(x[range3],y[range3],sigma3,mu3,scale,useBounds) + p3 = fitP$par + + fit3P = fit3G_2(x, y, sigma1, sigma2, sigma3, p[1], p[2], p2[1], p2[2], p3[1], p3[2], useBounds) + p4 = fit3P$par + + # plot ############################## + sumFit2 = (p4[2]*dnorm(x2,p4[1],sigma1))+(p4[4]*dnorm(x2,p4[3],sigma2))+(p4[6]*dnorm(x2,p4[5],sigma3)) + sumFit = (p4[2]*dnorm(x,p4[1],sigma1))+(p4[4]*dnorm(x,p4[3],sigma2))+(p4[6]*dnorm(x,p4[5],sigma3)) + fq=getFitQuality(x,y,sort(c(p4[1],p4[3],p4[5]))[1],sort(c(p4[1],p4[3],p4[5]))[3],resol,sumFit=sumFit)$fq_new + + if (plot & (fq < FQ)) lines(x2,p4[2]*dnorm(x2,p4[1],sigma1),col="yellow") + if (plot & (fq < FQ)) abline(v = p4[1], col="yellow") + fwhm = getFwhm(p4[1],resol) + half_max = max(p4[2]*dnorm(x2,p4[1],sigma1))*0.5 + if (plot & (fq < FQ)) lines(c(p4[1] - 0.5*fwhm, p4[1] + 0.5*fwhm),c(half_max,half_max),col="orange") + + if (plot & (fq < FQ)) lines(x2,p4[4]*dnorm(x2,p4[3],sigma2),col="yellow") + if (plot & (fq < FQ)) abline(v = p4[3], col="yellow") + fwhm = getFwhm(p4[3],resol) + half_max = max(p4[4]*dnorm(x2,p4[3],sigma2))*0.5 + if (plot & (fq < FQ)) lines(c(p4[3] - 0.5*fwhm, p4[3] + 0.5*fwhm),c(half_max,half_max),col="orange") + + if (plot & (fq < FQ)) lines(x2,p4[6]*dnorm(x2,p4[5],sigma3),col="yellow") + if (plot & (fq < FQ)) abline(v = p4[5], col="yellow") + fwhm = getFwhm(p4[5],resol) + half_max = max(p4[6]*dnorm(x2,p4[5],sigma3))*0.5 + if (plot & (fq < FQ)) lines(c(p4[5] - 0.5*fwhm, p4[5] + 0.5*fwhm),c(half_max,half_max),col="orange") + + if (plot & (fq < FQ)) lines(x2,sumFit2,col="red") + + h2=c(paste("mean =", p4[1], sep=" "), + paste("mean =", p4[3], sep=" "), + paste("mean =", p4[5], sep=" "), + paste("fq =", fq, sep=" ")) + + if (plot & (fq < FQ)) legend("topright", legend=h2) + ######################################### + + # area1 = sum(p4[2]*dnorm(x2,p4[1],sigma1)) + # area2 = sum(p4[4]*dnorm(x2,p4[3],sigma2)) + # area3 = sum(p4[6]*dnorm(x2,p4[5],sigma3)) + + # area1 = max(p4[2]*dnorm(x2,p4[1],sigma1)) + # area2 = max(p4[4]*dnorm(x2,p4[3],sigma2)) + # area3 = max(p4[6]*dnorm(x2,p4[5],sigma3)) + + area1 = getArea(p4[1],resol,p4[2],sigma1,int.factor) + area2 = getArea(p4[3],resol,p4[4],sigma2,int.factor) + area3 = getArea(p4[5],resol,p4[6],sigma3,int.factor) + + peak.area = c(peak.area, area1) + peak.area = c(peak.area, area2) + peak.area = c(peak.area, area3) + + peak.mean = c(peak.mean, p4[1]) + peak.mean = c(peak.mean, p4[3]) + peak.mean = c(peak.mean, p4[5]) + + peak.scale = c(peak.scale, p4[2]) + peak.scale = c(peak.scale, p4[4]) + peak.scale = c(peak.scale, p4[6]) + + peak.sigma = c(peak.sigma, sigma1) + peak.sigma = c(peak.sigma, sigma2) + peak.sigma = c(peak.sigma, sigma3) + + return(list("mean"=peak.mean, "scale"=peak.scale, "sigma"=peak.sigma, "area"=peak.area, "qual"=fq)) + +} \ No newline at end of file diff --git a/DIMS/AddOnFunctions/fit4G.R b/DIMS/AddOnFunctions/fit4G.R new file mode 100644 index 0000000..6cad63d --- /dev/null +++ b/DIMS/AddOnFunctions/fit4G.R @@ -0,0 +1,18 @@ +fit4G_2 <- function(x,y,sig1,sig2,sig3,sig4,mu1,scale1,mu2,scale2,mu3,scale3,mu4,scale4,useBounds){ + + f = function(p){ + d = p[2]*dnorm(x,mean=p[1],sd=sig1) + p[4]*dnorm(x,mean=p[3],sd=sig2) + p[6]*dnorm(x,mean=p[5],sd=sig3) + p[8]*dnorm(x,mean=p[7],sd=sig4) + sum((d-y)^2) + } + + if (useBounds){ + lower = c(x[1],0,x[1],0,x[1],0,x[1],0) + upper = c(x[length(x)],Inf,x[length(x)],Inf,x[length(x)],Inf,x[length(x)],Inf) + + optim(c(mu1,scale1,mu2,scale2,mu3,scale3,mu4,scale4),f,control=list(maxit=10000),method="L-BFGS-B",lower=lower,upper=upper) + + } else { + optim(c(mu1,scale1,mu2,scale2,mu3,scale3,mu4,scale4),f,control=list(maxit=10000)) + } + +} diff --git a/DIMS/AddOnFunctions/fit4peaks.R b/DIMS/AddOnFunctions/fit4peaks.R new file mode 100644 index 0000000..a506541 --- /dev/null +++ b/DIMS/AddOnFunctions/fit4peaks.R @@ -0,0 +1,149 @@ +fit4peaks <- function(x2,x,y,index,scale,resol,useBounds=FALSE,plot=FALSE,FQ,int.factor) { + + peak.mean = NULL + peak.area = NULL + peak.scale = NULL + peak.sigma = NULL + + range1=c(index[1]-2,index[1]-1,index[1],index[1]+1,index[1]+2) + range2=c(index[2]-2,index[2]-1,index[2],index[2]+1,index[2]+2) + range3=c(index[3]-2,index[3]-1,index[3],index[3]+1,index[3]+2) + range4=c(index[4]-2,index[4]-1,index[4],index[4]+1,index[4]+2) + if (range1[1]==0) range1=range1[-1] + if (length(x)length(x)) + if (length(remove)>0) { + range4=range4[-remove] + # message(length(range4)) + } + + # check for negative or 0 + remove=which(range1<1) + if (length(remove)>0) range1=range1[-remove] + remove=which(range2<1) + if (length(remove)>0) range2=range2[-remove] + remove=which(range3<1) + if (length(remove)>0) range3=range3[-remove] + remove=which(range4<1) + if (length(remove)>0) range4=range4[-remove] + + # remove NA + if (length(which(is.na(y[range1])))!=0) range1=range1[-which(is.na(y[range1]))] + if (length(which(is.na(y[range2])))!=0) range2=range2[-which(is.na(y[range2]))] + if (length(which(is.na(y[range3])))!=0) range3=range3[-which(is.na(y[range3]))] + if (length(which(is.na(y[range4])))!=0) range4=range4[-which(is.na(y[range4]))] + + mu1 = weighted.mean(x[range1],y[range1]) + sigma1 = getSD(x[range1],y[range1]) + fitP = fitG_2(x[range1],y[range1],sigma1,mu1,scale,useBounds) + p = fitP$par + + mu2 = weighted.mean(x[range2],y[range2]) + sigma2 = getSD(x[range2],y[range2]) + fitP = fitG_2(x[range2],y[range2],sigma2,mu2,scale,useBounds) + p2 = fitP$par + + mu3 = weighted.mean(x[range3],y[range3]) + sigma3 = getSD(x[range3],y[range3]) + fitP = fitG_2(x[range3],y[range3],sigma3,mu3,scale,useBounds) + p3 = fitP$par + + mu4 = weighted.mean(x[range4],y[range4]) + sigma4 = getSD(x[range4],y[range4]) + fitP = fitG_2(x[range4],y[range4],sigma4,mu4,scale,useBounds) + p4 = fitP$par + + fit4P = fit4G_2(x, y, sigma1, sigma2, sigma3, sigma3, p[1], p[2], p2[1], p2[2], p3[1], p3[2], p4[1], p4[2], useBounds) + p5 = fit4P$par + + # plot ##################################### + sumFit2 = (p5[2]*dnorm(x2,p5[1],sigma1))+(p5[4]*dnorm(x2,p5[3],sigma2))+(p5[6]*dnorm(x2,p5[5],sigma3))+(p5[8]*dnorm(x2,p5[7],sigma3)) + sumFit = (p5[2]*dnorm(x,p5[1],sigma1))+(p5[4]*dnorm(x,p5[3],sigma2))+(p5[6]*dnorm(x,p5[5],sigma3))+(p5[8]*dnorm(x,p5[7],sigma3)) + fq=getFitQuality(x,y,sort(c(p5[1],p5[3],p5[5],p5[7]))[1],sort(c(p5[1],p5[3],p5[5],p5[7]))[4],resol,sumFit=sumFit)$fq_new + + if (plot & (fq < FQ)) lines(x2,p5[2]*dnorm(x2,p5[1],sigma1),col="purple") + if (plot & (fq < FQ)) abline(v = p5[1], col="purple") + fwhm = getFwhm(p5[1],resol) + half_max = max(p5[2]*dnorm(x2,p5[1],sigma1))*0.5 + if (plot & (fq < FQ)) lines(c(p5[1] - 0.5*fwhm, p5[1] + 0.5*fwhm),c(half_max,half_max),col="orange") + + if (plot & (fq < FQ)) lines(x2,p5[4]*dnorm(x2,p5[3],sigma2),col="purple") + if (plot & (fq < FQ)) abline(v = p5[3], col="purple") + fwhm = getFwhm(p5[3],resol) + half_max = max(p5[4]*dnorm(x2,p5[3],sigma2))*0.5 + if (plot & (fq < FQ)) lines(c(p5[3] - 0.5*fwhm, p5[3] + 0.5*fwhm),c(half_max,half_max),col="orange") + + if (plot & (fq < FQ)) lines(x2,p5[6]*dnorm(x2,p5[5],sigma3),col="purple") + if (plot & (fq < FQ)) abline(v = p5[5], col="purple") + fwhm = getFwhm(p5[5],resol) + half_max = max(p5[6]*dnorm(x2,p5[5],sigma3))*0.5 + if (plot & (fq < FQ)) lines(c(p5[5] - 0.5*fwhm, p5[5] + 0.5*fwhm),c(half_max,half_max),col="orange") + + if (plot & (fq < FQ)) lines(x2,p5[8]*dnorm(x2,p5[7],sigma3),col="purple") + if (plot & (fq < FQ)) abline(v = p5[7], col="purple") + fwhm = getFwhm(p5[7],resol) + half_max = max(p5[8]*dnorm(x2,p5[7],sigma3))*0.5 + if (plot & (fq < FQ)) lines(c(p5[7] - 0.5*fwhm, p5[7] + 0.5*fwhm),c(half_max,half_max),col="orange") + + if (plot & (fq < FQ)) lines(x2,sumFit2,col="blue") + + #fq = abs(sum(y) - sum(sumFit))/sum(y) + #fq=abs(sum(y) - sum(sumFit))/sum(y) + #fq=mean(abs(sumFit - y)/sumFit) + + + h2=c(paste("mean =", p5[1], sep=" "), + paste("mean =", p5[3], sep=" "), + paste("mean =", p5[5], sep=" "), + paste("mean =", p5[7], sep=" "), + paste("fq =", fq, sep=" ")) + + if (plot & (fq < FQ)) legend("topright", legend=h2) + ############################################# + + # area1 = sum(p5[2]*dnorm(x2,p5[1],sigma1)) + # area2 = sum(p5[4]*dnorm(x2,p5[3],sigma2)) + # area3 = sum(p5[6]*dnorm(x2,p5[5],sigma3)) + # area4 = sum(p5[8]*dnorm(x2,p5[7],sigma4)) + + # area1 = max(p5[2]*dnorm(x2,p5[1],sigma1)) + # area2 = max(p5[4]*dnorm(x2,p5[3],sigma2)) + # area3 = max(p5[6]*dnorm(x2,p5[5],sigma3)) + # area4 = max(p5[8]*dnorm(x2,p5[7],sigma4)) + + area1 = getArea(p5[1],resol,p5[2],sigma1,int.factor) + area2 = getArea(p5[3],resol,p5[4],sigma2,int.factor) + area3 = getArea(p5[5],resol,p5[6],sigma3,int.factor) + area4 = getArea(p5[7],resol,p5[8],sigma4,int.factor) + + peak.area = c(peak.area, area1) + peak.area = c(peak.area, area2) + peak.area = c(peak.area, area3) + peak.area = c(peak.area, area4) + + peak.mean = c(peak.mean, p5[1]) + peak.mean = c(peak.mean, p5[3]) + peak.mean = c(peak.mean, p5[5]) + peak.mean = c(peak.mean, p5[7]) + + peak.scale = c(peak.scale, p5[2]) + peak.scale = c(peak.scale, p5[4]) + peak.scale = c(peak.scale, p5[6]) + peak.scale = c(peak.scale, p5[8]) + + peak.sigma = c(peak.sigma, sigma1) + peak.sigma = c(peak.sigma, sigma2) + peak.sigma = c(peak.sigma, sigma3) + peak.sigma = c(peak.sigma, sigma4) + + return(list("mean"=peak.mean, "scale"=peak.scale, "sigma"=peak.sigma, "area"=peak.area, "qual"=fq)) + +} \ No newline at end of file diff --git a/DIMS/AddOnFunctions/fitG.R b/DIMS/AddOnFunctions/fitG.R new file mode 100644 index 0000000..b1b595d --- /dev/null +++ b/DIMS/AddOnFunctions/fitG.R @@ -0,0 +1,18 @@ +fitG_2 <- function(x,y,sig,mu,scale,useBounds) { + + f = function(p) { + d = p[2]*dnorm(x,mean=p[1],sd=sig) + sum((d-y)^2) + } + + if (useBounds){ + lower = c(x[1],0,x[1],0) + upper = c(x[length(x)],Inf,x[length(x)],Inf) + + optim(c(as.numeric(mu), as.numeric(scale)), + f,control=list(maxit=10000),method="L-BFGS-B",lower=lower,upper=upper) + } else { + #optim(c(mu,scale),f) + optim(c(as.numeric(mu),as.numeric(scale)),f,control=list(maxit=10000)) + } +} diff --git a/DIMS/AddOnFunctions/fitGaussian.R b/DIMS/AddOnFunctions/fitGaussian.R new file mode 100644 index 0000000..3c5027d --- /dev/null +++ b/DIMS/AddOnFunctions/fitGaussian.R @@ -0,0 +1,327 @@ +fitGaussian <- function(x2,x,y,index,scale,resol,outdir,force,useBounds,plot,scanmode,int.factor,width,height) { + # force=length(index) + # useBounds=FALSE + + peak.mean = NULL + peak.area = NULL + peak.qual = NULL + peak.min = NULL + peak.max = NULL + + FQ1 = 0.15 + FQ = 0.2 + + # One local max + if (force==1){ + + retVal = fit1Peak(x2,x,y,index,scale,resol,plot,FQ1,useBounds) + + scale = 2 + + if (retVal$mean[1]x[length(x)]) { # <=== mean outside range + + # do it again + return(fitGaussian(x2,x,y,index,scale,resol,outdir,force=1,useBounds=TRUE,plot,scanmode,int.factor,width,height)) + + } else { # <=== mean within range + + if (retVal$qual > FQ1){ # <=== bad fit + + # Try to fit two curves + + # diff(diff(x)) essentially computes the discrete analogue of the second derivative, so should be negative at local maxima. + # The +1 below takes care of the fact that the result of diff is shorter than the input vector. + new_index=which(diff(sign(diff(y)))==-2)+1 + + if (length(new_index)!=2) { + new_index = round(length(x)/3) + new_index = c(new_index, 2*new_index) + } + + #length(new_index) + return(fitGaussian(x2,x,y,new_index,scale,resol,outdir,force=2,useBounds=FALSE,plot,scanmode,int.factor,width,height)) + + } else { # <=== good fit + + peak.mean = c(peak.mean, retVal$mean) + #peak.area = c(peak.area, sum(retVal$scale*dnorm(x2,retVal$mean,retVal$sigma))) + # "Centroid mode" + # peak.area = c(peak.area, max(retVal$scale*dnorm(x2,retVal$mean,retVal$sigma))) + peak.area = c(peak.area, getArea(retVal$mean,resol,retVal$scale,retVal$sigma,int.factor)) + peak.qual = retVal$qual + peak.min = x[1] + peak.max = x[length(x)] + } + } + + # Two local max + } else if (force==2 & (length(x)>6)) { + + # fit two curves + retVal = fit2peaks(x2,x,y,index,scale,resol,useBounds,plot,FQ,int.factor) # <=== fit 2 curves + + if (retVal$mean[1]x[length(x)] | # <=== one of means outside range + retVal$mean[2]x[length(x)]) { + + # check quality + if (retVal$qual > FQ) { # <=== bad fit + + # do it again + return(fitGaussian(x2,x,y,index,scale,resol,outdir,force=2,useBounds=TRUE,plot,scanmode,int.factor,width,height)) + + # good fit + } else { + + # check which mean is outside range + # Todo ========> check this!!!!!!!!!!!!!!! + for (i in 1:length(retVal$mean)){ + if (retVal$mean[i]x[length(x)] ) { + peak.mean = c(peak.mean, -i) + peak.area = c(peak.area, -i) + } else { + peak.mean = c(peak.mean, retVal$mean[i]) + peak.area = c(peak.area, retVal$area[i]) + } + } + peak.qual = retVal$qual + peak.min = x[1] + peak.max = x[length(x)] + } + } else { # <=== all means within range + + if (retVal$qual > FQ) { # <=== bad fit + + # Try to fit three curves + new_index=which(diff(sign(diff(y)))==-2)+1 + + if (length(new_index)!=3) { + new_index = round(length(x)/4) + new_index = c(new_index, 2*new_index, 3*new_index) + } + + #length(new_index) + return(fitGaussian(x2,x,y,new_index,scale,resol,outdir,force=3,useBounds=FALSE,plot,scanmode,int.factor,width,height)) + + } else { # <======== good fit + + # check if means are within 3 ppm and sum if so + tmp = retVal$qual + + nMeanNew = -1 + nMean = length(retVal$mean) + while (nMean!=nMeanNew){ + nMean = length(retVal$mean) + retVal = isWithinXppm(retVal$mean, retVal$scale, retVal$sigma, retVal$area, x2, x, ppm=4, resol, plot) + nMeanNew = length(retVal$mean) + } + + retVal$qual = tmp + + h2=NULL + + for (i in 1:length(retVal$mean)){ + h2 = c(h2, paste("mean =", retVal$mean[i], sep=" ")) + + peak.mean = c(peak.mean, retVal$mean[i]) + peak.area = c(peak.area, retVal$area[i]) + } + peak.qual = retVal$qual + peak.min = x[1] + peak.max = x[length(x)] + + h2 = c(h2, paste("fq =", retVal$qual, sep=" ")) + if (plot) legend("topright", legend=h2) + } + } + + # Three local max + } else if (force==3 & (length(x)>6)){ + + retVal = fit3peaks(x2,x,y,index,scale,resol,useBounds,plot,FQ,int.factor) + + # outside range + if (retVal$mean[1]x[length(x)] | # <=== one of means outside range + retVal$mean[2]x[length(x)] | + retVal$mean[3]x[length(x)]) { + + # check quality + if (retVal$qual > FQ) { # <=== bad fit + + # do it again + return(fitGaussian(x2,x,y,index,scale,resol,outdir,force,useBounds=TRUE,plot,scanmode,int.factor,width,height)) + + # good fit + } else { + + # check which mean is outside range + # Todo ========> check this!!!!!!!!!!!!!!! + for (i in 1:length(retVal$mean)){ + if (retVal$mean[i]x[length(x)] ) { + peak.mean = c(peak.mean, -i) + peak.area = c(peak.area, -i) + } else { + peak.mean = c(peak.mean, retVal$mean[i]) + peak.area = c(peak.area, retVal$area[i]) + } + } + peak.qual = retVal$qual + peak.min = x[1] + peak.max = x[length(x)] + } + + } else { # <=== all means within range + + if (retVal$qual > FQ) { # <=== bad fit + + # Try to fit four curves + new_index=which(diff(sign(diff(y)))==-2)+1 + + if (length(new_index)!=4) { + new_index = round(length(x)/5) + new_index = c(new_index, 2*new_index, 3*new_index, 4*new_index) + } + + #length(new_index) + return(fitGaussian(x2,x,y,new_index,scale,resol,outdir,force=4,useBounds=FALSE,plot,scanmode,int.factor,width,height)) + + } else { # <======== good fit + + # check if means are within 3 ppm and sum if so + tmp = retVal$qual + + nMeanNew = -1 + nMean = length(retVal$mean) + while (nMean!=nMeanNew){ + nMean = length(retVal$mean) + retVal = isWithinXppm(retVal$mean, retVal$scale, retVal$sigma, retVal$area, x2, x, ppm=4, resol, plot) + nMeanNew = length(retVal$mean) + } + + retVal$qual = tmp + + h2=NULL + # peak.mean=NULL + # peak.area=NULL + + for (i in 1:length(retVal$mean)){ + h2 = c(h2, paste("mean =", retVal$mean[i], sep=" ")) + + peak.mean = c(peak.mean, retVal$mean[i]) + peak.area = c(peak.area, retVal$area[i]) + } + peak.qual = retVal$qual + peak.min = x[1] + peak.max = x[length(x)] + + h2 = c(h2, paste("fq =", retVal$qual, sep=" ")) + if (plot) legend("topright", legend=h2) + + } + } + + # Four local max + } else if (force==4 & (length(x)>6)){ + + retVal = fit4peaks(x2,x,y,index,scale,resol,useBounds,plot,FQ,int.factor) + + if (retVal$mean[1]x[length(x)] | + retVal$mean[2]x[length(x)] | + retVal$mean[3]x[length(x)] | + retVal$mean[4]x[length(x)]) { + + # check quality + if (retVal$qual > FQ) { # <=== bad fit + + # do it again + return(fitGaussian(x2,x,y,index,scale,resol,outdir,force,useBounds=TRUE,plot,scanmode,int.factor,width,height)) + + # good fit + } else { + + # check which mean is outside range + # Todo ========> check this!!!!!!!!!!!!!!! + for (i in 1:length(retVal$mean)){ + if (retVal$mean[i]x[length(x)] ) { + peak.mean = c(peak.mean, -i) + peak.area = c(peak.area, -i) + } else { + peak.mean = c(peak.mean, retVal$mean[i]) + peak.area = c(peak.area, retVal$area[i]) + } + } + peak.qual = retVal$qual + peak.min = x[1] + peak.max = x[length(x)] + } + + } else { + + if (retVal$qual > FQ) { # <=== bad fit + + # Generate 1 curve + return(fitGaussian(x2,x,y,index,scale,resol,outdir,force=5,useBounds=FALSE,plot,scanmode,int.factor,width,height)) + + } else { # <======== good fit + + # check if means are within 3 ppm and sum if so + tmp = retVal$qual + + nMeanNew = -1 + nMean = length(retVal$mean) + while (nMean!=nMeanNew){ + nMean = length(retVal$mean) + retVal = isWithinXppm(retVal$mean, retVal$scale, retVal$sigma, retVal$area, x2, x, ppm=4, resol, plot) + nMeanNew = length(retVal$mean) + } + + retVal$qual = tmp + + h2=NULL + + for (i in 1:length(retVal$mean)){ + h2 = c(h2, paste("mean =", retVal$mean[i], sep=" ")) + + peak.mean = c(peak.mean, retVal$mean[i]) + peak.area = c(peak.area, retVal$area[i]) + } + peak.qual = retVal$qual + peak.min = x[1] + peak.max = x[length(x)] + + h2 = c(h2, paste("fq =", retVal$qual, sep=" ")) + if (plot) legend("topright", legend=h2) + } + } + + # More then four local max + } else { + + scale=2 + FQ1=0.40 + useBounds=TRUE + index=which(y==max(y)) + retVal = fit1Peak(x2,x,y,index,scale,resol,plot,FQ1,useBounds) + + if (retVal$qual > FQ1){ # <=== bad fit + + if (plot) dev.off() + + rval = generateGaussian(x,y,resol,plot,scanmode,int.factor, width, height) + peak.mean = c(peak.mean, rval$mean) + peak.area = c(peak.area, rval$area) + peak.min = rval$min + peak.max = rval$max + peak.qual = 0 + + } else { # <=== good fit + + peak.mean = c(peak.mean, retVal$mean) + peak.area = c(peak.area, getArea(retVal$mean,resol,retVal$scale,retVal$sigma,int.factor)) + peak.qual = retVal$qual + peak.min = x[1] + peak.max = x[length(x)] + } + } + + return(list("mean"=peak.mean, "area"=peak.area, "qual" = peak.qual, "min"=peak.min, "max"=peak.max)) +} diff --git a/DIMS/AddOnFunctions/fitGaussianInit.R b/DIMS/AddOnFunctions/fitGaussianInit.R new file mode 100644 index 0000000..1c92bce --- /dev/null +++ b/DIMS/AddOnFunctions/fitGaussianInit.R @@ -0,0 +1,60 @@ +fitGaussianInit <- function(x,y,int.factor,scale,resol,outdir,sampname,scanmode,plot,width,height,i,start,end) { + # scanmode="negative" + + mz.range = x[length(x)] - x[1] + x2 = seq(x[1],x[length(x)],length=mz.range*int.factor) + + # # diff(diff(x)) essentially computes the discrete analogue of the second derivative, so should be negative at local maxima. + # # The +1 below takes care of the fact that the result of diff is shorter than the input vector. + # index=which(diff(sign(diff(y)))==-2)+1 + + # Alway try to fit one curve first + index = which(y==max(y)) + + if (scanmode=="positive"){ + plot_label="pos_fit.png" + } else { + plot_label="neg_fit.png" + } + + if (plot) { + CairoPNG(filename=paste(outdir,"Gaussian_fit",paste(sampname, x[1], plot_label, sep="_"), sep="/"), width, height) + plot(x,y,xlab="m/z",ylab="I", ylim=c(0,1.5*max(y)),main=paste(i,start,end, sep=" ")) #, ylim=c(0,1e+05) + } + + retVal = fitGaussian(x2,x,y,index,scale,resol,outdir,force=length(index),useBounds=FALSE,plot,scanmode,int.factor,width,height) + + if (plot) { + if (length(retVal$mean)==1) { + + result = tryCatch(dev.off(), warning = function(w){}, + error=function(e){}, + finally = {}) + + # file.rename(paste(outdir,"Gaussian_fit", paste(sampname, x[1], plot_label, sep="_"), sep="/"), + # paste(outdir,"Gaussian_fit", paste(sampname, round(retVal$mean,digits = 6), plot_label, sep="_"), sep="/")) + } else { + + + h2=NULL + for (i in 1:length(retVal$mean)){ + h2=c(h2, paste("mean =", retVal$mean[i], sep=" ")) + } + h2=c(h2, paste("fq =", retVal$qual, sep=" ")) + legend("topright", legend=h2) + + dev.off() + + for (i in 1:length(retVal$mean)){ + if (retVal$mean[i]!=-1){ + file.copy(paste(outdir,"Gaussian_fit", paste(sampname, x[1], plot_label, sep="_"), sep="/"), + paste(outdir,"Gaussian_fit", paste(sampname, round(retVal$mean[i],digits = 6), plot_label, sep="_"), sep="/")) + } + } + file.remove(paste(outdir,"Gaussian_fit", paste(sampname, x[1], plot_label, sep="_"), sep="/")) + } + } + + return(list("mean"=retVal$mean, "area"=retVal$area, "qual"=retVal$qual, "min"=retVal$min, "max"=retVal$max)) + +} diff --git a/DIMS/AddOnFunctions/generateBreaksFwhm.R b/DIMS/AddOnFunctions/generateBreaksFwhm.R new file mode 100644 index 0000000..1a42631 --- /dev/null +++ b/DIMS/AddOnFunctions/generateBreaksFwhm.R @@ -0,0 +1,26 @@ +# options(digits=16) +# resol = 140000 +# breaks.fwhm = 1 +# i = 1 +# breaks.fwhm.avg = NULL +# +# Sys.time() +# +# while (breaks.fwhm[length(breaks.fwhm)]<1000){ +# +# resol.mz = resol*(1/sqrt(2)^(log2(breaks.fwhm[i]/200))) +# fwhm.0.1 = (breaks.fwhm[i]/resol.mz)/10 +# breaks.fwhm = c(breaks.fwhm, breaks.fwhm[i] + fwhm.0.1) +# breaks.fwhm.avg = c(breaks.fwhm.avg,(breaks.fwhm[i] +breaks.fwhm[i+1])/2) +# +# if (i %% 10000 == 0){ +# cat(paste("i =", i)) +# cat(paste("breaks.fwhm =", breaks.fwhm[length(breaks.fwhm)])) +# } +# +# i = i + 1 +# } +# +# Sys.time() +# save(list=c("breaks.fwhm", "breaks.fwhm.avg"), file="breaks.RData") +# cat("Breaks saved!") diff --git a/DIMS/AddOnFunctions/generateExcelFile.R b/DIMS/AddOnFunctions/generateExcelFile.R new file mode 100644 index 0000000..31474d9 --- /dev/null +++ b/DIMS/AddOnFunctions/generateExcelFile.R @@ -0,0 +1,24 @@ +generateExcelFile <- function(peaklist, fileName, sub, plot = TRUE) { + # plotdir=file.path(plotdir) + # imageNum=2 + # fileName="./results/xls/Pos_allpgrps_identified" + # subName=c("","_box") + + end <- 0 + i <- 0 + + if (dim(peaklist)[1]>=sub & (sub>0)){ + for (i in 1:floor(dim(peaklist)[1]/sub)){ + start=-(sub-1)+i*sub + end=i*sub + message(paste0(start, ":", end)) + + genExcelFileV3(peaklist[c(start:end),], paste(fileName, sep="_"), plot) + } + } + start = end + 1 + end = dim(peaklist)[1] + message(start) + message(end) + genExcelFileV3(peaklist[c(start:end),], paste(fileName, i+1, sep="_"), plot) +} diff --git a/DIMS/AddOnFunctions/generateGaussian.R b/DIMS/AddOnFunctions/generateGaussian.R new file mode 100644 index 0000000..da6dec2 --- /dev/null +++ b/DIMS/AddOnFunctions/generateGaussian.R @@ -0,0 +1,48 @@ +generateGaussian <- function(x,y,resol,plot,scanmode,int.factor,width,height) { + + factor=1.5 + index = which(y==max(y))[1] + x=x[index] + y=y[index] + mu = x + fwhm = getFwhm(mu,resol) + x.p = c(mu-factor*fwhm, x, mu+factor*fwhm) + y.p = c(0, y, 0) + + # if (plot) dir.create("./results/plots",showWarnings = FALSE) + # if (plot) dir.create("./results/plots/Gaussian_fit",showWarnings = FALSE) + + if (plot) { + if (scanmode=="positive"){ + plot_label="pos_fit.png" + } else { + plot_label="neg_fit.png" + } + } + + mz.range = x.p[length(x.p)] - x.p[1] + x2 = seq(x.p[1],x.p[length(x.p)],length=mz.range*int.factor) + sigma = getSD(x.p,y.p) + scale = optimizeGauss(x.p,y.p,sigma,mu) + + if (plot) { + CairoPNG(filename=paste("./results/Gaussian_fit",paste(sampname, mu, plot_label, sep="_"), sep="/"), width, height) + + plot(x.p,y.p,xlab="m/z",ylab="I", ylim=c(0,1.5*max(y))) + lines(x2,scale*dnorm(x2,mu,sigma), col="green") + half_max = max(scale*dnorm(x2,mu,sigma))*0.5 + lines(c(mu - 0.5*fwhm, mu + 0.5*fwhm),c(half_max,half_max),col="orange") + abline(v = mu, col="green") + h=c(paste("mean =", mu, sep=" ")) + legend("topright", legend=h) + + dev.off() + } + + # area = sum(scale*dnorm(x2,mu,sigma)) + # area = max(scale*dnorm(x2,mu,sigma)) + area = getArea(mu,resol,scale,sigma,int.factor) + + return(list("mean"=mu, "area"=area, "min"=x2[1] , "max"=x2[length(x2)])) + +} \ No newline at end of file diff --git a/DIMS/AddOnFunctions/getArea.R b/DIMS/AddOnFunctions/getArea.R new file mode 100644 index 0000000..415b050 --- /dev/null +++ b/DIMS/AddOnFunctions/getArea.R @@ -0,0 +1,23 @@ +getArea <- function(mu,resol,scale,sigma,int.factor){ + + # mu=p3[1] + # scale=p3[2] + # sigma=sigma1 + + # mu=p3[1] + # scale=p3[2] + # sigma=sigma1 + + # avoid too big vector (cannot allocate vector of size ...) + if (mu>1200) return(0) + + fwhm = getFwhm(mu,resol) + mzMin = mu - 2*fwhm + mzMax = mu + 2*fwhm + mz.range = mzMax - mzMin + x_int = seq(mzMin,mzMax,length=mz.range*int.factor) + + #plot(x_int,scale*dnorm(x_int,mu,sigma),col="red",type="l") + + return(sum(scale*dnorm(x_int,mu,sigma))/100) +} diff --git a/DIMS/AddOnFunctions/getDeltaMZ.R b/DIMS/AddOnFunctions/getDeltaMZ.R new file mode 100644 index 0000000..f09a0a4 --- /dev/null +++ b/DIMS/AddOnFunctions/getDeltaMZ.R @@ -0,0 +1,5 @@ +getDeltaMZ <- function(mass, breaks.fwhm){ + index = which(breaks.fwhmintRangeMin) & (xintRangeMin) & (x mu: ",mu)) + # message(paste("====================> resol.mz: ",resol.mz)) + # }) + + # , error = function(e) { + # error-handler-code + # }, finally = { + # cleanup-code + # }) + + return(fwhm) +} diff --git a/DIMS/AddOnFunctions/getPatients.R b/DIMS/AddOnFunctions/getPatients.R new file mode 100644 index 0000000..3b03f0a --- /dev/null +++ b/DIMS/AddOnFunctions/getPatients.R @@ -0,0 +1,9 @@ +getPatients <- function(peaklist){ + patients=colnames(peaklist)[grep("P", colnames(peaklist), fixed = TRUE)] + patients=unique(as.vector(unlist(lapply(strsplit(patients, ".", fixed = TRUE), function(x) x[1])))) + # ToDo: If 2 P's in sample names!!!!!!!!!!!!! + # patients=sort(as.numeric(unique(as.vector(unlist(lapply(strsplit(patients, "_P", fixed = TRUE), function(x) x[2])))))) + patients=sort(as.numeric(unique(as.vector(unlist(lapply(strsplit(patients, "P", fixed = TRUE), function(x) x[2])))))) + + return(patients) +} diff --git a/DIMS/AddOnFunctions/getSD.R b/DIMS/AddOnFunctions/getSD.R new file mode 100644 index 0000000..197a002 --- /dev/null +++ b/DIMS/AddOnFunctions/getSD.R @@ -0,0 +1,9 @@ +getSD <- function(x,y,resol=140000) { + + index = which(y==max(y)) + mean = x[index] + resol.mz = resol*(1/sqrt(2)^(log2(mean/200))) + fwhm = mean/resol.mz + sd = (fwhm/2)*0.85 + return(sd) +} diff --git a/DIMS/AddOnFunctions/get_patient_data_to_helix.R b/DIMS/AddOnFunctions/get_patient_data_to_helix.R new file mode 100644 index 0000000..c582177 --- /dev/null +++ b/DIMS/AddOnFunctions/get_patient_data_to_helix.R @@ -0,0 +1,31 @@ +get_patient_data_to_helix <- function(metab_interest_sorted, metab_list_all){ + # Combine Z-scores of metab groups together + df_all_metabs_zscores <- bind_rows(metab_interest_sorted) + # Change columnnames + colnames(df_all_metabs_zscores) <- c("HMDB_name", "Patient", "Z_score") + # Change Patient column to character instead of factor + df_all_metabs_zscores$Patient <- as.character(df_all_metabs_zscores$Patient) + + # Delete whitespaces HMDB_name + df_all_metabs_zscores$HMDB_name <- str_trim(df_all_metabs_zscores$HMDB_name, "right") + + # Split HMDB_name column on "nitine;" for match dims_helix_table + df_all_metabs_zscores$HMDB_name_split <- str_split_fixed(df_all_metabs_zscores$HMDB_name, "nitine;", 2)[,1] + + # Combine stofgroepen + dims_helix_table <- bind_rows(metab_list_all) + # Filter table for metabolites for Helix + dims_helix_table <- dims_helix_table %>% filter(Helix == "ja") + # Split HMDB_name column on "nitine;" for match df_all_metabs_zscores + dims_helix_table$HMDB_name_split <- str_split_fixed(dims_helix_table$HMDB_name, "nitine;", 2)[,1] + dims_helix_table <- dims_helix_table %>% select(HMDB_name_split, Helix_naam, high_zscore, low_zscore) + + # Filter DIMS results for metabolites for Helix + df_metabs_helix <- df_all_metabs_zscores %>% filter(HMDB_name_split %in% dims_helix_table$HMDB_name_split) + # Combine dims_helix_table and df_metabs_helix, adding Helix codes etc. + df_metabs_helix <- df_metabs_helix %>% left_join(dims_helix_table, by = join_by(HMDB_name_split)) + + df_metabs_helix <- df_metabs_helix %>% select(HMDB_name, Patient, Z_score, Helix_naam, high_zscore, low_zscore) + + return(df_metabs_helix) +} \ No newline at end of file diff --git a/DIMS/AddOnFunctions/globalAssignments.HPC.R b/DIMS/AddOnFunctions/globalAssignments.HPC.R new file mode 100644 index 0000000..9207661 --- /dev/null +++ b/DIMS/AddOnFunctions/globalAssignments.HPC.R @@ -0,0 +1,114 @@ +#.libPaths(new="/hpc/local/CentOS7/dbg_mz/R_libs/3.2.2") +## calculate relative abundancies from theoretical mass and composition + +#library(Rdisop) + +#theor.table <- read.table(file="C:/Users/mraves/Metabolomics/TheoreticalMZ_Negative_composition.txt", sep="\t", header=TRUE) +options(digits=16) + +#library(OrgMassSpecR) + +suppressPackageStartupMessages(library(lattice)) +# The following list was copied from Rdisop elements.R and corrected for C, H, O, Cl, S according to NIST +Ba <- list(name= 'Ba', mass=130, isotope=list(mass=c(-0.093718, 0, -0.094958, 0, -0.095514, -0.094335, -0.095447, -0.094188, -0.094768), abundance=c(0.00106, 0, 0.00101, 0, 0.02417, 0.06592, 0.07854, 0.1123, 0.717))) +Br <- list(name= 'Br', mass=79, isotope=list(mass=c(-0.0816639, 0, -0.083711), abundance=c(0.5069, 0, 0.4931))) +C <- list(name= 'C', mass=12, isotope=list(mass=c(0, 0.003354838, 0.003241989), abundance=c(0.9893, 0.0107, 0))) +Ca <- list(name= 'Ca', mass=40, isotope=list(mass=c(-0.0374094, 0, -0.0413824, -0.0412338, -0.0445194, 0, -0.046311, 0, -0.047467), abundance=c(0.96941, 0, 0.00647, 0.00135, 0.02086, 0, 4e-05, 0, 0.00187))) +Cl <- list(name= 'Cl', mass=35, isotope=list(mass=c(-0.03114732, 0, -0.03409741), abundance=c(0.7576, 0, 0.2424))) +Cr <- list(name= 'Cr', mass=50, isotope=list(mass=c(-0.0539536, 0, -0.0594902, -0.0593487, -0.0611175), abundance=c(0.04345, 0, 0.83789, 0.09501, 0.02365))) +Cu <- list(name= 'Cu', mass=63, isotope=list(mass=c(-0.0704011, 0, -0.0722071), abundance=c(0.6917, 0, 0.3083))) +F <- list(name= 'F', mass=19, isotope=list(mass=c(-0.00159678), abundance=c(1))) +Fe <- list(name= 'Fe', mass=54, isotope=list(mass=c(-0.0603873, 0, -0.0650607, -0.0646042, -0.0667227), abundance=c(0.058, 0, 0.9172, 0.022, 0.0028))) +H <- list(name= 'H', mass=1, isotope=list(mass=c(0.00782503207, 0.014101778, 0.01604928), abundance=c(0.999885, 0.000115, 0))) +Hg <- list(name= 'Hg', mass=196, isotope=list(mass=c(-0.034193, 0, -0.033257, -0.031746, -0.0317, -0.029723, -0.029383, 0, -0.026533), abundance=c(0.0015, 0, 0.0997, 0.1687, 0.231, 0.1318, 0.2986, 0, 0.0687))) +I <- list(name= 'I', mass=127, isotope=list(mass=c(-0.095527), abundance=c(1))) +K <- list(name= 'K', mass=39, isotope=list(mass=c(-0.0362926, -0.0360008, -0.0381746), abundance=c(0.932581, 0.000117, 0.067302))) +Li <- list(name= 'Li', mass=6, isotope=list(mass=c(0.0151214, 0.016003), abundance=c(0.075, 0.925))) +Mg <- list(name= 'Mg', mass=24, isotope=list(mass=c(-0.0149577, -0.0141626, -0.0174063), abundance=c(0.7899, 0.1, 0.1101))) +Mn <- list(name= 'Mn', mass=55, isotope=list(mass=c(-0.0619529), abundance=c(1))) +N <- list(name= 'N', mass=14, isotope=list(mass=c(0.003074002, 0.00010897), abundance=c(0.99634, 0.00366))) +Na <- list(name= 'Na', mass=23, isotope=list(mass=c(-0.0102323), abundance=c(1))) +Ni <- list(name= 'Ni', mass=58, isotope=list(mass=c(-0.0646538, 0, -0.0692116, -0.0689421, -0.0716539, 0, -0.0720321), abundance=c(0.68077, 0, 0.26223, 0.0114, 0.03634, 0, 0.00926))) +O <- list(name= 'O', mass=16, isotope=list(mass=c(-0.00508538044, -0.0008683, -0.0008397), abundance=c(0.99757, 0.000381, 0.00205))) +P <- list(name= 'P', mass=31, isotope=list(mass=c(-0.026238), abundance=c(1))) +S <- list(name= 'S', mass=32, isotope=list(mass=c(-0.027929, -0.02854124, -0.0321331, 0, -0.03291924), abundance=c(0.9499, 0.0075, 0.0425, 0, 1e-04))) +Se <- list(name= 'Se', mass=74, isotope=list(mass=c(-0.0775254, 0, -0.080788, -0.0800875, -0.0826924, 0, -0.0834804, 0, -0.0833022), abundance=c(0.0089, 0, 0.0936, 0.0763, 0.2378, 0, 0.4961, 0, 0.0873))) +Si <- list(name= 'Si', mass=28, isotope=list(mass=c(-0.0230729, -0.0235051, -0.0262293), abundance=c(0.9223, 0.0467, 0.031))) +Sn <- list(name= 'Sn', mass=112, isotope=list(mass=c(-0.095174, 0, -0.097216, -0.096652, -0.098253, -0.097044, -0.098391, -0.09669, -0.0978009, 0, -0.0965596, 0, -0.0947257),abundance=c(0.0097, 0, 0.0065, 0.0034, 0.1453, 0.0768, 0.2423, 0.0859, 0.3259, 0, 0.0463, 0, 0.0579))) +Zn <- list(name= 'Zn', mass=64, isotope=list(mass=c(-0.0708552, 0, -0.0739653, -0.0728709, -0.0751541, 0, -0.074675), abundance=c(0.486, 0, 0.279, 0.041, 0.188, 0, 0.006))) + +NH4 <- list(name= "NH4", mass=18, isotope=list(mass=c(0.03437, 0.03141, -0.95935)), abundance=c(0.995, 0.004, 0.001)) # SISweb: 18.03437 100 19.03141 0.4 19.04065 0.1 +Ac <- list(name= "Ac", mass=60, isotope=list(mass=c(0.02114, 0.02450, 0.02538)), abundance=c(0.975, 0.021, 0.004)) # SISweb: 60.02114 100 61.02450 2.2 62.02538 0.4 +NaCl <- list(name= "NaCl", mass=58, isotope=list(mass=c(-0.04137, 0, -0.04433)), abundance=c(0.755, 0, 0.245)) # SISweb: 57.95862 100 59.95567 32.4 +NaCl2 <- list(name= "NaCl2", mass=116, isotope=list(mass=c(-0.08274, 0, -0.08866)), abundance=c(0.755, 0, 0.245)) # SISweb: 57.95862 100 59.95567 32.4 +NaCl3 <- list(name= "NaCl3", mass=174, isotope=list(mass=c(-0.12411, 0, -0.13299)), abundance=c(0.755, 0, 0.245)) # SISweb: 57.95862 100 59.95567 32.4 +NaCl4 <- list(name= "NaCl4", mass=232, isotope=list(mass=c(-0.16548, 0, -0.17732)), abundance=c(0.755, 0, 0.245)) # SISweb: 57.95862 100 59.95567 32.4 +NaCl5 <- list(name= "NaCl5", mass=290, isotope=list(mass=c(-0.20685, 0, -0.22165)), abundance=c(0.755, 0, 0.245)) # SISweb: 57.95862 100 59.95567 32.4 +For <- list(name= "For", mass=45, isotope=list(mass=c(-0.00233, 0.00103)), abundance=c(0.989, 0.011)) # SISweb: 46.00549 100 47.00885 1.1 (47.0097 0.1) 48.00973 0.4 +Na2 <- list(name= "2Na-H", mass=46, isotope=list(mass=c(-1.0282896)), abundance=c(1)) # SISweb for Na2: 45.97954 100 # minus 1 H ! +Met <- list(name= "CH3OH", mass=32, isotope=list(mass=c(1.034045,1.037405)), abundance=c(0.989,0.011)) # SISweb: 32.02622 100 33.02958 1.1 33.0325 0.1 34.03046 0.2 +CH3OH <- list(name= "CH3OH", mass=32, isotope=list(mass=c(1.034045,1.037405)), abundance=c(0.989,0.011)) # SISweb: 32.02622 100 33.02958 1.1 33.0325 0.1 34.03046 0.2 +Na3 <- list(name= "3Na-2H", mass=69, isotope=list(mass=c(-2.0463469)), abundance=c(1)) # SISweb for Na2: 45.97954 100 # minus 1 H ! +KCl <- list(name= "KCl", mass=74, isotope=list(mass=c(-0.06744,0.92961,-0.06744,0.92772)), abundance=c(0.7047,0.2283,0.0507,0.0162)) # SISweb: 73.93256 100 75.92961 32.4 75.93067 7.2 77.92772 2.3 +H2PO4 <- list(name= "H2PO4", mass=97, isotope=list(mass=c(-0.03091)), abundance=c(1)) +HSO4 <- list(name= "HSO4", mass=97, isotope=list(mass=c(-0.04042,0,-0.04462)), abundance=c(0.96,0,0.04)) +Met2 <- list(name= "Met2", mass=64, isotope=list(mass=c(1.060265,1.013405)), abundance=c(0.978,0.022)) +Met3 <- list(name= "Met3", mass=96, isotope=list(mass=c(1.086485,1.089845)), abundance=c(0.969,0.031)) +Met4 <- list(name= "Met4", mass=128, isotope=list(mass=c(1.112705,1.116065)), abundance=c(0.959,0.041)) +Met5 <- list(name= "Met5", mass=160, isotope=list(mass=c(1.20935,1.142285)), abundance=c(0.949,0.051)) +NaminH <- list(name= "Na-H", mass=21, isotope=list(mass=c(-0.02571416)), abundance=c(1)) +KminH <- list(name= "K-H", mass=37, isotope=list(mass=c(-0.05194,0.94617)), abundance=c(0.9328,0.0672)) +H2O <- list(name= "H2O", mass=-19, isotope=list(mass=c(-0.01894358)), abundance=c(1)) +NaK <- list(name= "NaK-H", mass=61, isotope=list(mass=c(-0.054345,0.943765)), abundance=c(0.9328,0.0672)) +min2H <- list(name= "min2H", mass=-2, isotope=list(mass=c(-0.0151014)), abundance=c(1)) +plus2H <- list(name= "plus2H", mass=2, isotope=list(mass=c(0.0151014)), abundance=c(1)) +plus2Na <- list(name= "plus2Na", mass=46, isotope=list(mass=c(-0.02046), abundance=c(1))) +plusNaH <- list(name= "plusNaH", mass=24, isotope=list(mass=c(-0.00295588), abundance=c(1))) +plusKH <- list(name= "plusKH", mass=40, isotope=list(mass=c(-0.029008,0.969101)), abundance=c(0.9328,0.0672)) +plusHNH <- list(name= "plusHNH", mass=19, isotope=list(mass=c(0.04164642)), abundance=c(1)) +min3H <- list(name= "min3H", mass=-3, isotope=list(mass=c(-0.02182926)), abundance=c(1)) +plus3H <- list(name= "plus3H", mass=3, isotope=list(mass=c(0.02182926)), abundance=c(1)) +plus3Na <- list(name= "plus3Na", mass=68, isotope=list(mass=c(0.96931)), abundance=c(1)) +plus2NaH <- list(name= "plus2NaH", mass=47, isotope=list(mass=c(0.2985712)), abundance=c(1)) +plusNa2H <- list(name= "plusNa2H", mass=25, isotope=list(mass=c(0.00432284)), abundance=c(1)) + +Cl37 <- list(name= "Cl37", mass=37, isotope=list(mass=c(-0.03409741)), abundance=c(1)) + +allelements <- list(Ba, Br, C, Ca, Cl, Cr, Cu, F, Fe, H, Hg, I, K, Li, Mg, Mn, N, Na, Ni, O, P, S, Se, Si, Sn, Zn) +allAdducts <- list(Ba, Br, Ca, Cl, Cl37, Cr, Cu, Fe, Hg, I, K, Li, Mg, Mn, Na, Ni, Se, Si, Sn, Zn, NH4, Ac, NaCl, For,Na2,CH3OH,NaCl2,NaCl3,NaCl4,NaCl5,Na3,KCl,H2PO4,HSO4,Met2,Met3,Met4,Met5,NaminH,KminH,H2O,NaK,min2H,plus2H,plus2Na,plusNaH,plusKH,min3H,plus3H,plusHNH,plus3Na,plus2NaH,plusNa2H) + +#atomsinuse <- c("P", "O", "N", "C", "H", "S", "Cl") +#atomicWts <- c(30.97376163, 15.99491463, 14.0030740052, 12.0000, 1.0078250321, 31.9720707, 34.968852721) +#electron <- 0.00054858 +#Mol.comp <- c(0,4,0,0,1,1,0) +#Mol.exact <- sum(Mol.comp * atomicWts) + electron +#Mol.corr <- Mol.exact - 0.0022 + 0.000007*Mol.exact # mass as found in peak group list +#getMass(Mol) + electron - 0.0022 + 0.000007*getMass(Mol) + +#atomsinuse <- c("P", "O", "N", "C", "H", "S", "Cl", "D", "34S", "18O") +#atomicWts <- c(30.97376163, 15.99491463, 14.0030740052, 12.0000, 1.0078250321, 31.9720707, 34.968852721, 2.0141017778, 33.96786690, 17.9991610) +#electron <- 0.00054858 +#Mol.comp <- c(0,4,0,0,1,1,0,0,0,0) # main peak HSO4 +#Mol.comp <- c(0,4,0,0,0,1,0,1,0,0) # deuterated +#Mol.comp <- c(0,4,0,0,1,0,0,0,1,0) # 34S +#Mol.comp <- c(0,3,0,0,1,1,0,0,0,1) # 18O +#Mol.exact <- sum(Mol.comp * atomicWts) + electron +#Mol.corr <- Mol.exact - 0.0022 + 0.000007*Mol.exact # mass as found in peak group list + +atomsinuse <- c("P", "O", "N", "C", "H", "S", "Cl", "D", "13C", "34S", "18O", "37Cl") +atomicWts <- c(30.97376163, 15.99491463, 14.0030740052, 12.0000, 1.0078250321, 31.9720707, 34.968852721, 2.0141017778, 13.0033548378, 33.96786690, 17.9991610, 36.96590259) +electron <- 0.00054858 +############# P O N C H S Cl D 13C 34S 18O 37Cl +#Mol.comp <- c(0,6,0,6,12,0,1, 0, 0, 0, 0, 0) # main peak Galactose HCl negative ion +#Mol.comp <- c(0,6,0,5,12,0,1, 0, 1, 0, 0, 0) # 13C Galactose HCl negative ion +#Mol.comp <- c(0,6,0,6,11,0,1, 1, 0, 0, 0, 0) # D Galactose HCl negative ion +#Mol.comp <- c(0,5,0,6,12,0,1, 0, 0, 0, 1, 0) # 18O Galactose HCl negative ion +#Mol.comp <- c(0,6,0,6,12,0,0, 0, 0, 0, 0, 1) # 18O Galactose HCl negative ion +#Mol.exact <- sum(Mol.comp * atomicWts) + electron +#Mol.corr <- Mol.exact - 0.0022 + 0.000007*Mol.exact # mass as found in peak group list + +Hmass <- H$mass + H$isotope$mass[1] +Dmass <- H$mass + 1 + H$isotope$mass[2] +Tmass <- H$mass + 2 + H$isotope$mass[3] +C13mass <- C$mass + 1 + C$isotope$mass[2] +N15mass <- N$mass + 1 + N$isotope$mass[2] diff --git a/DIMS/AddOnFunctions/iden.code.R b/DIMS/AddOnFunctions/iden.code.R new file mode 100644 index 0000000..4853647 --- /dev/null +++ b/DIMS/AddOnFunctions/iden.code.R @@ -0,0 +1,45 @@ +iden.code <- function(peaklist, db, ppm, theor_mass_label) { + # theor_mass_label = {"MNeg", "Mpos"} + + mcol <- peaklist[ , "mzmed.pgrp"] + theor.mcol <- db[,theor_mass_label] + assi_HMDB <- iso_HMDB <- HMDB_code <- c(rep("",length(mcol))) + theormz_HMDB <- c(rep(0,length(mcol))) + + for(t in 1:length(mcol)){ + mz<-mcol[t] + mtol <- mz*ppm/1000000 + selp <- which(theor.mcol > (mz - mtol) & theor.mcol < (mz + mtol)) + assinames <- isonames <- HMDBcodes <- "" + + if(length(selp)!=0){ + for(i in 1:length(selp)){ + + if(grepl(" iso ", db[selp[i], "CompoundName"])){ + mainpeak <- strsplit(db[selp[i],"CompoundName"]," iso ")[[1]][1] + + # Check if peak without isotope occurs, this assumes that peaklist is ordered on mass!!! + if(length(grep(mainpeak, assi_HMDB, fixed=TRUE)) > 0){ + isonames <- as.character(paste(isonames,as.character(db[selp[i],"CompoundName"]), sep=";")) + } + + } else { + assinames <- as.character(paste(assinames,as.character(db[selp[i], "CompoundName"]), sep=";")) + HMDBcodes <- as.character(paste(HMDBcodes, as.character(rownames(db)[selp[i]]), sep=";")) + } + } + } + + assi_HMDB[t] <- as.character(assinames) + iso_HMDB[t] <- as.character(isonames) + if ((assinames=="") & (isonames=="")){ + theormz_HMDB[t] <- "" + } else { + theormz_HMDB[t] <- theor.mcol[selp[1]] + } + HMDB_code[t] <- as.character(HMDBcodes) + } + + return(cbind(peaklist, assi_HMDB, iso_HMDB, theormz_HMDB, HMDB_code)) +} + diff --git a/DIMS/AddOnFunctions/ident.hires.noise.HPC.R b/DIMS/AddOnFunctions/ident.hires.noise.HPC.R new file mode 100644 index 0000000..0be43d1 --- /dev/null +++ b/DIMS/AddOnFunctions/ident.hires.noise.HPC.R @@ -0,0 +1,242 @@ +# modified identify function to also look for adducts and their isotopes +ident.hires.noise.HPC <- function(peaklist, allAdducts, scanmode="Negative", look4=c("Cl", "Ac"), identlist=NULL, resol=280000, slope=0, incpt=0, ppm.fixed, ppm.iso.fixed) { + + # peaklist=outlist_Neg + # scanmode="Negative" + # identlist=noise.Neg.MZ + # look4=look4.addN + # resol=resol + # slope=0 + # incpt=0 + # ppm.fixed=3 + # ppm.iso.fixed=2 + + metlin <- assi <- iso <- rep("", nrow(peaklist)) + theormz <- nisos <- expint <- conf <- rep(0, nrow(peaklist)) + # nrH <- nrD <- nrC <- nr13C <- nrN <- nr15N <- nrO <- nrP <- nrS <- nrCl <- rep(0, nrow(peaklist)) + + # add adducts to identification list + if (scanmode == "Positive") { add.mode <- "+" } else { add.mode <- "-" } + # identlist <- theorMZ + identlist.orig <- identlist + + for (p in 1:length(look4)) { # loop over type of adduct + #print(p) + identlist.Adduct <- identlist.orig + identlist.Adduct[ , "CompoundName"] <- as.character(identlist.orig[ , "CompoundName"]) + + #add2label <- paste("[M+", look4[p], "]", add.mode, sep="") + if (look4[p]=="H2O") { + add2label <- paste("[M-", look4[p], "]", add.mode, sep="") + } else { + add2label <- paste("[M+", look4[p], "]", add.mode, sep="") + } + + identlist.Adduct[ , "CompoundName"] <- paste(identlist.Adduct[ , "CompoundName"], add2label, sep=" ") + adductInfo <- elementInfo(look4[p], allAdducts) + if (scanmode == "Positive") { + adductMass <- adductInfo$mass[1] + adductInfo$isotope$mass[1] - Hmass } else { + adductMass <- adductInfo$mass[1] + adductInfo$isotope$mass[1] + Hmass + } + for (q in 1:nrow(identlist.Adduct)) { # loop over compounds in database + # construct information for compound + adduct: + if (scanmode == "Positive") { + identlist.Adduct[q, "Mpos"] <- as.numeric(identlist.Adduct[q, "Mpos"]) + adductMass # + Na - H + identlist.Adduct[q, "MNeg"] <- 0 + } else { + identlist.Adduct[q, "Mpos"] <- 0 + identlist.Adduct[q, "MNeg"] <- as.numeric(identlist.Adduct[q, "MNeg"]) + adductMass # + Cl + H + } + } + + # # modify columns with info for mol. formula: + # if (look4[p] == "2Na-H") { + # identlist.Adduct[, "nrH"] <- as.numeric(identlist.Adduct[, "nrH"]) - 1 + # } else if (look4[p] == "NH4") { + # identlist.Adduct[, "nrH"] <- as.numeric(identlist.Adduct[, "nrH"]) + 3 + # identlist.Adduct[, "nrN"] <- as.numeric(identlist.Adduct[, "nrN"]) + 1 + # } else if (look4[p] == "Cl") { + # identlist.Adduct[, "nrCl"] <- as.numeric(identlist.Adduct[, "nrCl"]) + 1 + # identlist.Adduct[, "nrH"] <- as.numeric(identlist.Adduct[, "nrH"]) + 1 + # } else if (look4[p] == "Ac") { + # identlist.Adduct[ , "nrC"] <- as.numeric(identlist.Adduct[ , "nrC"]) + 2 + # identlist.Adduct[ , "nrH"] <- as.numeric(identlist.Adduct[ , "nrH"]) + 3 + # identlist.Adduct[ , "nrO"] <- as.numeric(identlist.Adduct[ , "nrO"]) + 2 + # } else if (look4[p] == "CH3OH+H") { + # identlist.Adduct[ , "nrC"] <- as.numeric(identlist.Adduct[ , "nrC"]) + 1 + # identlist.Adduct[ , "nrH"] <- as.numeric(identlist.Adduct[ , "nrH"]) + 4 + # identlist.Adduct[ , "nrO"] <- as.numeric(identlist.Adduct[ , "nrO"]) + 1 + # } else { + # identlist.Adduct[, "nrH"] <- as.numeric(identlist.Adduct[, "nrH"]) - 1 + # } + identlist <- rbind(identlist, identlist.Adduct) + } # end for p adducts in look4 + + + if (scanmode == "Positive") { theor.mcol <- as.numeric(identlist[ , "Mpos"]) } else { + theor.mcol <- as.numeric(identlist[ , "MNeg"]) } + # apply correction using regression line obtained with ISses + theor.mcol <- (1+slope)*theor.mcol + incpt + + # get mz information from peaklist + mcol <- peaklist[ , "mzmed.pgrp"] + # if column with average intensities is missing, calculate it: + if (!("avg.int" %in% colnames(peaklist))){ + mzmaxcol <- which(colnames(peaklist) == "mzmax.pgrp") + endcol <- ncol(peaklist) + peaklist[ , "avg.int"] <- apply(peaklist[ ,(mzmaxcol+1):(endcol)], 1, mean) + } + + # # generate URL for Metlin: + # for (p in 1:nrow(peaklist)) { # for each peak # p <- 1 + # mzpeak <- as.numeric(mcol[p]) + # # resolution as function of mz: + # resol.mz <- resol*(1/sqrt(2)^(log2(mzpeak/200))) + # fwhm <- mzpeak/resol.mz + # # massmin <- mzpeak - (0.00003 + 0.000003*mzpeak) - (1.0078250321 - 0.00054858) - fwhm # for Metlin db + # # massmax <- mzpeak - (0.00003 + 0.000003*mzpeak) - (1.0078250321 - 0.00054858) + fwhm # for Metlin db + # if (scanmode == "Positive") { + # massmin <- mzpeak - (1.0078250321 - 0.00054858) - fwhm # for Metlin db + # massmax <- mzpeak - (1.0078250321 - 0.00054858) + fwhm # for Metlin db + # } else { + # massmin <- mzpeak + (1.0078250321 - 0.00054858) - fwhm # for Metlin db + # massmax <- mzpeak + (1.0078250321 - 0.00054858) + fwhm # for Metlin db + # } + # metlin[p] <- paste("http://metlin.scripps.edu/metabo_list.php?mass_min=", massmin, "&mass_max=", massmax, sep="") + # } + + + # do indentification using own database: + for (t in 1:nrow(identlist)) { # theoretical mass # t <- 45 + # for (t in 1:169) { + #print(as.character(identlist[t,"CompoundName"])) + theor.mz <- theor.mcol[t] + + theor.comp <- as.character(identlist[t, "Composition"]) + #theor.comp <- mol.formula(identlist[t, ]) + + # # if there's Deuterium, Tritium, 13C or 15N in the composition: + # mass.incr <- 0 + (as.numeric(identlist[t,"nrD"])*Dmass) + + # (as.numeric(identlist[t,"nrT"])*Tmass) + + # (as.numeric(identlist[t,"nrC13"])*C13mass) + + # (as.numeric(identlist[t,"nrN15"])*N15mass) + # theor.comp <- strsplit(theor.comp, "iso")[[1]][1] + mass.incr <- 0 + + # resolution as function of mz: + resol.mz <- resol*(1/sqrt(2)^(log2(theor.mz/200))) + # calculate fine-grained isotopic distribution using MIDAs + fwhm <- round(theor.mz/resol.mz,6) + + #system(paste("C:/Users/awillem7/tools/MIDAs_New/MIDAs_Example ", theor.comp, " 2 C00 \"\" \"\" ", fwhm, " 1 0 1e-50 2 tmp", sep=""),ignore.stderr=TRUE) + #system(paste("C:/Users/mraves/Metabolomics/MIDAs_New/MIDAs_Example ", theor.comp, " 2 C00 \"\" \"\" ", fwhm, " 1 0 1e-50 2 tmp", sep=""),ignore.stderr=TRUE) + #system(paste(path2MIDAS, theor.comp, " 2 C00 \"\" \"\" ", fwhm, " 1 0 1e-50 2 tmp", sep=""),ignore.stderr=TRUE) + + #system(paste("/data/home/luyf/Metabolomics/MIDAs/MIDAs_Example ", theor.comp, " 2 C00 \"\" \"\" ", fwhm, " 1 0 1e-50 2 tmp", sep=""),ignore.stderr=TRUE) + options(stringsAsFactors = FALSE) + #fgid <- read.table(file="tmp_Fine_Grained_Isotopic_Distribution", header=FALSE) + + # res <- try(fgid <- read.table(file="tmp_Fine_Grained_Isotopic_Distribution", header=FALSE)) + # if(inherits(res, "try-error")) + # { + # #error handling code, maybe just skip this iteration using + # message("Skipped") + # next + # } + # + # # correct mass for D, T, 13C and 15N + # fgid[ ,1] <- as.numeric(fgid[ ,1]) + mass.incr + # # calculate percentage intensities from relative intensities + # firstone <- as.numeric(fgid[1,2]) + # fgid[ ,3] <- as.numeric(fgid[ , 2]) / firstone + # #fgid <- as.matrix(fgid, ncol=3) + # # the mz in the MIDAs file are of the neutral molecule + # if (scanmode == "Positive") { mz.iso <- as.numeric(fgid[ , 1]) + Hmass - electron } + # if (scanmode == "Negative") { mz.iso <- as.numeric(fgid[ , 1]) - Hmass + electron } + # fgid <- cbind(fgid, mz.iso) + # colnames(fgid) <- c("mz","rel.int", "perc.int", "mz.iso") + + # compensate mz for presence of adduct + # Adduct.mass <- theor.mz - mz.iso[1] + # fgid[ , "mz.iso"] <- as.numeric(fgid[ , "mz.iso"]) + Adduct.mass + + # set tolerance for mz accuracy of main peak + mtol <- theor.mz*ppm.fixed/1000000 + # find main peak + selp <- which(mcol > (theor.mz - mtol) & mcol < (theor.mz + mtol)) + # selp <- which(mcol > (theor.mz - 0.01) & mcol < (theor.mz + 0.01)) + # peaklist[selp, c(1:4,(endcol+1))] + + # set tolerance for mz accuracy of isotope peaks + itol <- theor.mz*ppm.iso.fixed/1000000 + + # if (length(selp) > 1) { # more than one candidate peak for main; select best one based on isotope pattern + # #cat(as.character(identlist[t, "CompoundName"])); print(" has >1 candidate peaks") + # conf.local <- rep(0, length(selp)) + # for (p in 1:length(selp)) { # p <- 2 + # # determine isotope pattern for each candidate peak + # # obs.mz <- peaklist[selp[p],"mzmed.pgrp"] + # conf.local[p] <- match.isotope.pattern(peaklist, scanmode, selp[p], fgid, ppm.iso.fixed) + # } + # # selp <- selp[abs(mcol[selp] - theor.mz) == min(abs(mcol[selp] - theor.mz))] } + # selp <- na.exclude(selp[conf.local == max(conf.local)]) + # } + if (length(selp) > 1) { # more than one candidate peak for main; select best one based on mz.diff + selp <- selp[abs(mcol[selp] - theor.mz) == min(abs(mcol[selp] - theor.mz))] + } + if (length(selp) == 1) { # match for main + assi[selp] <- paste(assi[selp], as.character(identlist[t,"CompoundName"]), sep=";") + theormz[selp] <- theor.mz + # conf[selp] <- match.isotope.pattern(peaklist, scanmode, selp, fgid, ppm.iso.fixed) + + # nrH[selp] <- identlist[t,"nrH"] + # nrD[selp] <- identlist[t,"nrD"] + # nrC[selp] <- identlist[t,"nrC"] + # nr13C[selp] <- identlist[t,"nrC13"] + # nrN[selp] <- identlist[t,"nrN"] + # nr15N[selp] <- identlist[t,"nrN15"] + # nrO[selp] <- identlist[t,"nrO"] + # nrP[selp] <- identlist[t,"nrP"] + # nrS[selp] <- identlist[t,"nrS"] + # nrCl[selp] <- identlist[t,"nrCl"] + + # assign isotope peaks + # mz.main <- peaklist[selp, "mzmed.pgrp"] # mz of main peak + # int.main <- peaklist[selp, "avg.int"] # intensity of main peak (= 100%) + # # deviation from theoretical mass: + # diff <- theor.mz - mz.main + # # calculate expected intensities and select isotopes with exp.int > threshold + # fgid[ , "exp.int"] <- fgid[ , "perc.int"] * int.main + # fgid.subset <- fgid[(fgid[ , "exp.int"] > thresh), ] + # nisos[selp] <- nrow(fgid.subset) - 1 + # if (nrow(fgid.subset) > 1) { # avoid error message if fgid.subset has only 1 line + # for (f in 2:nrow(fgid.subset)) { # f <- 2 + # mz.target <- fgid.subset[f, "mz.iso"] - diff + # int.target <- fgid.subset[f, "exp.int"] + # # print(itarget) + # sel.iso <- peaklist[ , "mzmed.pgrp"] > (mz.target - itol) & peaklist[ , "mzmed.pgrp"] < (mz.target + itol) + # # sum(sel.iso) + # # sel.iso <- peaklist[ , "mzmed.pgrp"] > (mz.target - 0.01) & peaklist[ , "mzmed.pgrp"] < (mz.target + 0.01) + # # peaklist[sel.iso, c(1:4,23)] + # if (sum(sel.iso) == 1) { # 2 separate if-statements because of error if sum(sel.iso) = 0 + # if (peaklist[sel.iso, "avg.int"] > (int.target/2)) { # match + # iso[sel.iso] <- paste(paste(iso[sel.iso], as.character(identlist[t,"CompoundName"]), "iso", f, sep=" "),";", sep="") + # # peaklist[sel.iso, ] + # expint[sel.iso] <- fgid.subset[f, "exp.int"] + # } + # } else if (sum(sel.iso) > 1) { + # nrs.iso <- which(sel.iso) + # nr.iso <- nrs.iso[which(abs(peaklist[sel.iso, "avg.int"] - int.target) == min(abs(peaklist[sel.iso, "avg.int"] - int.target)))] + # if (peaklist[nr.iso, "avg.int"] > (int.target/2)) { # match + # # print(peaklist[nr.iso, "avg.int"]) + # iso[nr.iso] <- paste(paste(iso[nr.iso], as.character(identlist[t,"CompoundName"]), "iso", f, sep=" "),";", sep="") + # expint[nr.iso] <- fgid.subset[f, "exp.int"] + # } # end if + # } # end else if + # } # end for f + # } # end if + } # end if + } # end for t + # cbind(peaklist, nrH, nrD, nrC, nr13C, nrN, nr15N, nrO, nrP, nrS, nrCl, assi, theormz, conf, nisos, iso, expint, metlin) + cbind(peaklist, assi, theormz, conf, nisos, iso, expint, metlin) +} diff --git a/DIMS/AddOnFunctions/isWithinXppm.R b/DIMS/AddOnFunctions/isWithinXppm.R new file mode 100644 index 0000000..5da43c6 --- /dev/null +++ b/DIMS/AddOnFunctions/isWithinXppm.R @@ -0,0 +1,51 @@ +isWithinXppm <- function(mean, scale, sigma, area, x2, x, ppm=4, resol, plot) { + # mean=retVal$mean + # scale=retVal$scale + # sigma=retVal$sigma + # area=retVal$area + # ppm=3 + + # sort!!!!!!!!!!!!!!!! + index = order(mean) + mean = mean[index] + scale = scale[index] + sigma = sigma[index] + area = area[index] + + summed = NULL + remove = NULL + + if (length(mean)>1){ + for (i in 2:length(mean)){ + if ((abs(mean[i-1]-mean[i])/mean[i-1])*10^6 < ppm) { + + # avoid double occurance in sum + if ((i-1) %in% summed) next + + retVal = sumCurves(mean[i-1], mean[i], scale[i-1], scale[i], sigma[i-1], sigma[i], x2, x, resol, plot) + summed = c(summed, i-1, i) + if (is.nan(retVal$mean)) retVal$mean=0 + mean[i-1] = retVal$mean + mean[i] = retVal$mean + area[i-1] = retVal$area + area[i] = retVal$area + scale[i-1] = retVal$scale + scale[i] = retVal$scale + sigma[i-1] = retVal$sigma + sigma[i] = retVal$sigma + + remove = c(remove, i) + } + } + } + + if (length(remove)!=0){ + mean=mean[-c(remove)] + area=area[-c(remove)] + scale=scale[-c(remove)] + sigma=sigma[-c(remove)] + } + + return(list("mean"=mean, "area"=area, "scale"=scale, "sigma"=sigma, "qual"=NULL)) + +} \ No newline at end of file diff --git a/DIMS/AddOnFunctions/is_diagnostic_patient.R b/DIMS/AddOnFunctions/is_diagnostic_patient.R new file mode 100644 index 0000000..839f41d --- /dev/null +++ b/DIMS/AddOnFunctions/is_diagnostic_patient.R @@ -0,0 +1,6 @@ +is_diagnostic_patient <- function(patient_column){ + # Check for Diagnostics patients with correct patientnumber (e.g. starting with "P2024M") + diagnostic_patients <- grepl("^P[0-9]{4}M",patient_column) + + return(diagnostic_patients) +} \ No newline at end of file diff --git a/DIMS/AddOnFunctions/mergeDuplicatedRows.R b/DIMS/AddOnFunctions/mergeDuplicatedRows.R new file mode 100644 index 0000000..39cb8b4 --- /dev/null +++ b/DIMS/AddOnFunctions/mergeDuplicatedRows.R @@ -0,0 +1,49 @@ +mergeDuplicatedRows <- function(peaklist) { + # peaklist = outlist.tot + # resultDir = "./results" + # scanmode = "positive" + + # peaklist_index=which(peaklist[,"mzmed.pgrp"]=="94.9984524624111") + # peaklist[peaklist_index,] + + collapse <- function(label,pklst,index){ + # label = "iso_HMDB" + # pklst = peaklist + # index = peaklist_index + tmp2=as.vector(pklst[index,label]) + if (length(which(is.na(tmp2)))>0) tmp2=tmp2[-which(is.na(tmp2))] + return(paste(tmp2,collapse = ";")) + } + + options(digits=16) + + collect=NULL + remove=NULL + + index = which(duplicated(peaklist[, "mzmed.pgrp"])) + + while (length(index) > 0){ + + peaklist_index = which(peaklist[, "mzmed.pgrp"] == peaklist[index[1], "mzmed.pgrp"]) + # peaklist[peaklist_index,"iso_HMDB",drop=FALSE] + tmp=peaklist[peaklist_index[1],,drop=FALSE] + + tmp[,"assi_HMDB"]=collapse("assi_HMDB",peaklist,peaklist_index) + tmp[,"iso_HMDB"]=collapse("iso_HMDB",peaklist,peaklist_index) + tmp[,"HMDB_code"]=collapse("HMDB_code",peaklist,peaklist_index) + tmp[,"assi_noise"]=collapse("assi_noise",peaklist,peaklist_index) + if (tmp[,"assi_noise"]==";") tmp[,"assi_noise"]=NA + tmp[,"theormz_noise"]=collapse("theormz_noise",peaklist,peaklist_index) + if (tmp[,"theormz_noise"]=="0;0") tmp[,"theormz_noise"]=NA + + collect = rbind(collect, tmp) + remove = c(remove, peaklist_index) + + index=index[-which(peaklist[index, "mzmed.pgrp"] == peaklist[index[1], "mzmed.pgrp"])] + } + + if (!is.null(remove)) peaklist = peaklist[-remove,] + peaklist = rbind(peaklist,collect) + + return(peaklist) +} diff --git a/DIMS/AddOnFunctions/normalization_2.1.R b/DIMS/AddOnFunctions/normalization_2.1.R new file mode 100644 index 0000000..ed99964 --- /dev/null +++ b/DIMS/AddOnFunctions/normalization_2.1.R @@ -0,0 +1,75 @@ +# normalization_2.1(outlist.pos.id, "Intensity_all_peaks_positive_norm", groupNames.pos, on="total", assi_label="assi_HMDB") + +normalization_2.1 <- function(data, filename, groupNames, on="total", assi_label="assi"){ + # data=outlist.pos.id + # filename = "Intensity_all_peaks_positive_norm" + # groupNames = groupNames.pos + # on="total" + # assi_label="assi_HMDB" + + lastcol = length(groupNames) + 6 + before = data[ ,c(7:lastcol)] + + if (on=="total_IS") { + + assi = which(colnames(data)==assi_label) + data.int <- data[ ,c(assi,7:lastcol)] # assi and samples columns + data.assi = data.int[grep("(IS", data.int[,1], ignore.case=FALSE, fixed = TRUE),] + + } else if (on=="total_ident"){ + + assi = which(colnames(data)==assi_label) + assi.hmdb = which(colnames(data)=="assi.hmdb") + index = sort(union(which(data[,assi]!=""), which(data[,assi.hmdb]!=""))) + + data.int = data[ ,c(assi,7:lastcol)] # assi and samples columns + data.assi = data.int[index,] + + } else if (on=="total") { + + assi = which(colnames(data)==assi_label) + data.int = data[ ,c(assi,7:lastcol)] # assi and samples columns + data.assi = data.int + + } + + sum <- 0 + for (c in 2:ncol(data.assi)) { + sum <- sum + sum(as.numeric(data.assi[,c])) + } + average <- sum/(ncol(data.assi)-1) + for (c in 2:ncol(data.int)) { + factor <- sum(as.numeric(data.assi[,c]))/average + if (factor==0) { + data.int[ ,c]=0 + cat(colnames(data.int)[c]) + cat("factor==0 !!!") + } else { + data.int[ ,c] <- as.numeric(data.int[ ,c])/factor + } + } + + # colnames(data.int[,2:ncol(data.int)]) + + if (dim(data)[2]==lastcol){ + final.outlist.Pos.idpat.norm <- cbind(data[,1:6],data.int[,2:ncol(data.int)]) + } else { + final.outlist.Pos.idpat.norm <- cbind(data[,1:6],data.int[,2:ncol(data.int)],data[,(lastcol + 1):ncol(data)]) + } + + #outdir="./results/normalization" + #dir.create(outdir, showWarnings = FALSE) + + #CairoPNG(filename=paste(outdir, paste(filename, "_before.png", sep=""), sep="/"), width, height) + #sub=apply(before,2, function(x) sum(as.numeric(x))) + #barplot(as.vector(unlist(sub)), main="Not normalized",names.arg = colnames(before),las=2) + #dev.off() + + #CairoPNG(filename=paste(outdir, paste(filename, "_", on, ".png", sep=""), sep="/"), width, height) + #sub=apply(final.outlist.Pos.idpat.norm[,c(7:lastcol)],2, function(x) sum(as.numeric(x))) + #barplot(as.vector(unlist(sub)), main=filename,names.arg = colnames(final.outlist.Pos.idpat.norm[,c(7:lastcol)]),las=2) + #dev.off() + + return(final.outlist.Pos.idpat.norm) + +} diff --git a/DIMS/AddOnFunctions/optimizeGauss.R b/DIMS/AddOnFunctions/optimizeGauss.R new file mode 100644 index 0000000..0fbce75 --- /dev/null +++ b/DIMS/AddOnFunctions/optimizeGauss.R @@ -0,0 +1,11 @@ +optimizeGauss <- function(x,y,sigma,mu) { + + f = function(p,x,y,sigma,mu) { + curve = p*dnorm(x,mu,sigma) + return((max(curve)-max(y))^2) + } + + rval = optimize(f, c(0, 100000), tol = 0.0001,x,y,sigma,mu) + + return(rval$minimum) +} diff --git a/DIMS/AddOnFunctions/output_helix.R b/DIMS/AddOnFunctions/output_helix.R new file mode 100644 index 0000000..c098675 --- /dev/null +++ b/DIMS/AddOnFunctions/output_helix.R @@ -0,0 +1,31 @@ +output_for_helix <- function(protocol_name, df_metabs_helix){ + + # Remove positive controls + df_metabs_helix <- df_metabs_helix %>% filter(is_diagnostic_patient(Patient)) + + # Add 'Vial' column, each patient has unique ID + df_metabs_helix <- df_metabs_helix %>% + group_by(Patient) %>% + mutate(Vial = cur_group_id()) %>% + ungroup() + + # Split patient number into labnummer and Onderzoeksnummer + df_metabs_helix <- add_lab_id_and_onderzoeksnummer(df_metabs_helix) + + # Add column with protocol name + df_metabs_helix$Protocol <- protocol_name + + # Change name Z_score and Helix_naam columns to Amount and Name + change_columns <- c(Amount = "Z_score", Name = "Helix_naam") + df_metabs_helix <- df_metabs_helix %>% rename(all_of(change_columns)) + + # Select only necessary columns and set them in correct order + df_metabs_helix <- df_metabs_helix %>% + select(c(Vial, labnummer, Onderzoeksnummer, Protocol, Name, Amount)) + + # Remove duplicate patient-metabolite combinations ("leucine + isoleucine + allo-isoleucin_Z-score" is added 3 times) + df_metabs_helix <- df_metabs_helix %>% + group_by(Onderzoeksnummer, Name) %>% distinct() %>% ungroup() + + return(df_metabs_helix) +} \ No newline at end of file diff --git a/DIMS/AddOnFunctions/peak.grouping.Gauss.HPC.R b/DIMS/AddOnFunctions/peak.grouping.Gauss.HPC.R new file mode 100644 index 0000000..c36259a --- /dev/null +++ b/DIMS/AddOnFunctions/peak.grouping.Gauss.HPC.R @@ -0,0 +1,88 @@ +peak.grouping.Gauss.HPC <- function(outdir, fileIn, scanmode, resol, groupNames) { + # fileIn="./results/specpks_all/positive_outlist_i_min_1.197.RData" + + # ppm/2 + #range = 1.5e-06 + range = 2e-06 + startcol=7 + + # outlist.copy <- read.table(file=fileIn, sep="\t", header=TRUE) + load(fileIn) + outlist.copy = outlist_i_min_1 + batch = strsplit(fileIn, ".",fixed = TRUE)[[1]][3] + + outpgrlist = NULL + + while (max(as.numeric(outlist.copy[ , "height.pkt"])) > 0 ) { + + sel = which(as.numeric(outlist.copy[ , "height.pkt"]) == max(as.numeric(outlist.copy[ , "height.pkt"])))[1] + + # 3ppm range around max + mzref = as.numeric(outlist.copy[sel, "mzmed.pkt"]) + pkmin = -(range*mzref - mzref) + pkmax = 2*mzref-pkmin + + selp = as.numeric(outlist.copy[ , "mzmed.pkt"]) > pkmin & as.numeric(outlist.copy[ , "mzmed.pkt"]) < pkmax + tmplist = outlist.copy[selp, ] + + nrsamples = sum(selp) + if (nrsamples > 1) { + # 3ppm range around mean + mzmed.pgrp = mean(as.numeric(outlist.copy[selp, "mzmed.pkt"])) + mzmin.pgrp = -(range*mzmed.pgrp - mzmed.pgrp) + mzmax.pgrp = 2*mzmed.pgrp - mzmin.pgrp + + selp = as.numeric(outlist.copy[ , "mzmed.pkt"]) > mzmin.pgrp & as.numeric(outlist.copy[ , "mzmed.pkt"]) < mzmax.pgrp + tmplist = outlist.copy[selp, ] + + fq.worst.pgrp = as.numeric(max(outlist.copy[selp, "fq"])) + fq.best.pgrp = as.numeric(min(outlist.copy[selp, "fq"])) + ints.allsamps = rep(0, length(groupNames)) + names(ints.allsamps) = groupNames # same order as sample list!!! + + # if (length(unique(as.vector(tmplist[,"samplenr"]))) != length(as.vector(tmplist[,"samplenr"]))) { + # message(paste("bingo", sel)) + # break + # } + + # Check for each sample if multiple peaks exists, if so take the sum! + labels=unique(tmplist[,"samplenr"]) + ints.allsamps[labels] = as.vector(unlist(lapply(labels, function(x) {sum(as.numeric(tmplist[which(tmplist[ , "samplenr"]==x), "height.pkt"]))}))) + + outpgrlist = rbind(outpgrlist, c(mzmed.pgrp, fq.best.pgrp, fq.worst.pgrp, nrsamples, mzmin.pgrp, mzmax.pgrp, ints.allsamps)) + } + outlist.copy[selp, "height.pkt"] = -1 + } + + outpgrlist = as.data.frame(outpgrlist) # ignore warnings of duplicate row names + colnames(outpgrlist)[1:6] = c("mzmed.pgrp", "fq.best", "fq.worst", "nrsamples", "mzmin.pgrp", "mzmax.pgrp") + + # # filtering ################################################################################################################## + # final.outlist=outpgrlist[,c("mzmed.pgrp", "fq.best", "fq.worst","nrsamples","mzmin.pgrp","mzmax.pgrp", sampleNames)] + # # NB: in centroided mode, data files contains many "-1.000" values, from peak finding. Set these to zero. + # final.outlist[final.outlist == -1] = 0 + # + # # keep only peaks which occur in 3 out of 3 technical replicates in at least one sample in peak group list + # # peakFiltering(repl.pattern, final.outlist, nsampgrps, outdir, scanmode, startcol=7) + # # peakFiltering <- function(repl.pattern, final.outlist, nsampgrps, resultDir, scanmode, startcol){ + # nsamp = length(repl.pattern) + # nsampgrps = length(repl.pattern[[1]]) + # + # keep <- rep(0, nrow(final.outlist)) + # for (p in 1:nrow(final.outlist)) { + # for (g in 1:nsampgrps) { # g <- 31 + # if (keep[p] == 0 & sum(final.outlist[p, repl.pattern[[g]]] > 0) == length(repl.pattern[[g]]) ) { keep[p] <- 1 } + # } + # } + # + # tmp <- cbind(final.outlist, keep) + # final.outlist.filt <- tmp[keep == 1, ] + # + # # omit keep column + # final.outlist.filt <- final.outlist.filt[ , -ncol(final.outlist.filt)] + + #save(outpgrlist_part, file=paste(outdir, paste(scanmode, "_", mzstart, "_", mzend, ".RData", sep=""), sep="/")) + # save(final.outlist.filt, file=paste(outdir, "peak_grouping", paste(scanmode, "_",batch,".RData", sep=""), sep="/")) + save(outpgrlist, file=paste(outdir, "peak_grouping", paste(scanmode, "_",batch,".RData", sep=""), sep="/")) + +} \ No newline at end of file diff --git a/DIMS/AddOnFunctions/prepare_alarmvalues.R b/DIMS/AddOnFunctions/prepare_alarmvalues.R new file mode 100644 index 0000000..c09c536 --- /dev/null +++ b/DIMS/AddOnFunctions/prepare_alarmvalues.R @@ -0,0 +1,52 @@ +prepare_alarmvalues <- function(pt_name, dims_helix_table) { + # extract data for patient of interest (pt_name) + pt_metabs_helix <- dims_helix_table %>% filter(Patient == pt_name) + pt_metabs_helix$Z_score <- round(pt_metabs_helix$Z_score, 2) + + # Make empty dataframes for metabolites above or below alarmvalues + pt_list_high <- data.frame(HMDB_name = character(), Z_score = numeric()) + pt_list_low <- data.frame(HMDB_name = character(), Z_score = numeric()) + + # Loop over individual metabolites + for (metab in unique(pt_metabs_helix$HMDB_name)){ + # Get data for individual metabolite + pt_metab <- pt_metabs_helix %>% filter(HMDB_name == metab) + # print(pt_metab) + + # Check if zscore is positive of negative + if(pt_metab$Z_score > 0) { + # Get specific alarmvalue for metabolite + high_zscore_cutoff_metab <- pt_metabs_helix %>% filter(HMDB_name == metab) %>% pull(high_zscore) + + # If zscore is above the alarmvalue, add to pt_list_high table + if(pt_metab$Z_score > high_zscore_cutoff_metab) { + pt_metab_high <- pt_metab %>% select(HMDB_name, Z_score) + pt_list_high <- rbind(pt_list_high, pt_metab_high) + } + } else { + # Get specific alarmvalue for metabolite + low_zscore_cutoff_metab <- pt_metabs_helix %>% filter(HMDB_name == metab) %>% pull(low_zscore) + + # If zscore is below the alarmvalue, add to pt_list_low table + if(pt_metab$Z_score < low_zscore_cutoff_metab) { + pt_metab_low <- pt_metab %>% select(HMDB_name, Z_score) + pt_list_low <- rbind(pt_list_low, pt_metab_low) + } + } + } + + # sort tables on zscore + pt_list_high <- pt_list_high %>% arrange(desc(Z_score)) + pt_list_low <- pt_list_low %>% arrange(Z_score) + # add lines for increased, decreased + extra_line1 <- c("Increased", "") + extra_line2 <- c("Decreased", "") + # combine the two lists + top_metab_pt <- rbind(extra_line1, pt_list_high, extra_line2, pt_list_low) + # remove row names + rownames(top_metab_pt) <- NULL + # change column names for display + colnames(top_metab_pt) <- c("Metabolite", "Z-score") + + return(top_metab_pt) +} diff --git a/DIMS/AddOnFunctions/prepare_data.R b/DIMS/AddOnFunctions/prepare_data.R new file mode 100644 index 0000000..07a8805 --- /dev/null +++ b/DIMS/AddOnFunctions/prepare_data.R @@ -0,0 +1,42 @@ +prepare_data <- function(metab_list_all, zscore_patients_local) { + # remove "_Zscore" from column (patient) names + colnames(zscore_patients_local) <- gsub("_Zscore", "", colnames(zscore_patients_local)) + # put data into pages, max 20 violin plots per page in PDF + metab_interest_sorted <- list() + metab_category <- c() + for (metab_class_index in 1:length(metab_list_all)) { # "acyl_carnitines" "amino_acids" "crea_gua" + metab_class <- names(metab_list_all)[metab_class_index] + metab_list <- metab_list_all[[metab_class_index]] + if (ncol(metab_list) > 2) { + # third column are the alarm values, so reduce the data frame to 2 columns and save list + metab_list_alarm <- metab_list + metab_list <- metab_list[ , c(1,2)] + } + # make sure that all HMDB_names have 45 characters + for (metab_index in 1:length(metab_list$HMDB_name)) { + if (is.character(metab_list$HMDB_name[metab_index])) { + HMDB_name_separated <- strsplit(metab_list$HMDB_name[metab_index], "")[[1]] + } else { HMDB_name_separated <- "strspliterror" } + if (length(HMDB_name_separated) <= 45) { + HMDB_name_separated <- c(HMDB_name_separated, rep(" ", 45-length(HMDB_name_separated))) + } else { + HMDB_name_separated <- c(HMDB_name_separated[1:42], "...") + } + metab_list$HMDB_name[metab_index] <- paste0(HMDB_name_separated, collapse = "") + } + # find metabolites and ratios in data frame zscore_patients_local + metab_interest <- inner_join(metab_list, zscore_patients_local[-2], by = "HMDB_code") + # remove column "HMDB_code" + metab_interest <- metab_interest[ , -which(colnames(metab_interest) == "HMDB_code")] + # put the data frame in long format + metab_interest_melt <- reshape2::melt(metab_interest, id.vars = "HMDB_name") + # sort on metabolite names (HMDB_name) + sort_order <- order(metab_interest_melt$HMDB_name) + metab_interest_sorted[[metab_class_index]] <- metab_interest_melt[sort_order, ] + metab_category <- c(metab_category, metab_class) + } + names(metab_interest_sorted) <- metab_category + + return(metab_interest_sorted) + +} diff --git a/DIMS/AddOnFunctions/prepare_data_perpage.R b/DIMS/AddOnFunctions/prepare_data_perpage.R new file mode 100644 index 0000000..9f112f3 --- /dev/null +++ b/DIMS/AddOnFunctions/prepare_data_perpage.R @@ -0,0 +1,47 @@ +prepare_data_perpage <- function(metab_interest_sorted, metab_interest_contr, nr_plots_perpage, nr_pat=20, nr_contr=30) { + total_nr_pages <- 0 + metab_perpage <- list() + metab_category <- c() + for (metab_class_index in 1:length(metab_interest_sorted)) { # "acyl_carnitines" "amino_acids" "crea_gua" + # split list into pages, each page containing max nr_plots_perpage (20) compounds + metab_interest_perclass <- metab_interest_sorted[[metab_class_index]] + metab_class <- names(metab_interest_sorted)[metab_class_index] + # add controls + metab_interest_contr_perclass <- metab_interest_contr[[metab_class_index]] + # number of pages for this class + nr_pages <- ceiling(length(unique(metab_interest_perclass$HMDB_name)) / nr_plots_perpage) + for (page_nr in 1:nr_pages) { + total_nr_pages <- total_nr_pages + 1 + select_rows_start <- (nr_pat * nr_plots_perpage * (page_nr-1)) + 1 + select_rows_end <- nr_pat * nr_plots_perpage * page_nr + metab_onepage_pat <- metab_interest_perclass[select_rows_start:select_rows_end, ] + # same for controls + select_rows_start_contr <- (nr_contr * nr_plots_perpage * (page_nr-1)) + 1 + select_rows_end_contr <- nr_contr * nr_plots_perpage * page_nr + metab_onepage_pcontr <- metab_interest_contr_perclass[select_rows_start_contr:select_rows_end_contr, ] + # add controls + metab_onepage <- rbind(metab_onepage_pat, metab_onepage_pcontr) + # if a page has fewer than nr_plots_perpage plots, fill page with empty plots + NA_rows <- which(is.na(metab_onepage$HMDB_name)) + if (length(NA_rows) > 0) { + # repeat the patient and control variables + metab_onepage$variable[NA_rows] <- metab_onepage$variable[1:(nr_pat + nr_contr)] + # for HMDB name, substitute a number of spaces + for (row_nr in NA_rows) { + metab_onepage$HMDB_name[row_nr] <- paste0(rep("_", ceiling(row_nr/(nr_pat + nr_contr))), collapse = "") + } + metab_onepage$HMDB_name <- gsub("_", " ", metab_onepage$HMDB_name) + # leave the values at NA + } + # put data for one page into object with data for all pages + metab_perpage[[total_nr_pages]] <- metab_onepage + # create list of page headers + metab_category <- c(metab_category, paste(metab_class, page_nr, sep="_")) + } + } + # add page headers to list + names(metab_perpage) <- metab_category + + return(metab_perpage) + +} diff --git a/DIMS/AddOnFunctions/prepare_toplist.R b/DIMS/AddOnFunctions/prepare_toplist.R new file mode 100644 index 0000000..369f887 --- /dev/null +++ b/DIMS/AddOnFunctions/prepare_toplist.R @@ -0,0 +1,29 @@ +prepare_toplist <- function(pt_name, zscore_patients_copy) { + # set parameters for table + top_highest <- 20 + top_lowest <- 10 + + # extract data for patient of interest (pt_name) + pt_list <- zscore_patients_copy[ , c(1,2, which(colnames(zscore_patients_copy) == pt_name))] + # sort metabolites on Z-scores for this patient + pt_list_sort <- sort(pt_list[ , 3], index.return=TRUE) + pt_list_sorted <- pt_list[pt_list_sort$ix, ] + # determine top highest and lowest Z-scores for this patient + pt_list_sort <- sort(pt_list[ , 3], index.return=TRUE) + pt_list_low <- pt_list[pt_list_sort$ix[1:top_lowest], ] + pt_list_high <- pt_list[pt_list_sort$ix[length(pt_list_sort$ix):(length(pt_list_sort$ix)-top_highest+1)], ] + # round off Z-scores + pt_list_low[ , 3] <- round(as.numeric(pt_list_low[ , 3]), 2) + pt_list_high[ , 3] <- round(as.numeric(pt_list_high[ , 3]), 2) + # add lines for increased, decreased + extra_line1 <- c("Increased", "", "") + extra_line2 <- c("Decreased", "", "") + top_metab_pt <- rbind(extra_line1, pt_list_high, extra_line2, pt_list_low) + # remove row names + rownames(top_metab_pt) <- NULL + + # change column names for display + colnames(top_metab_pt) <- c("HMDB_ID", "Metabolite", "Z-score") + + return(top_metab_pt) +} diff --git a/DIMS/AddOnFunctions/remove.dupl.2.1.R b/DIMS/AddOnFunctions/remove.dupl.2.1.R new file mode 100644 index 0000000..89e307d --- /dev/null +++ b/DIMS/AddOnFunctions/remove.dupl.2.1.R @@ -0,0 +1,32 @@ +remove.dupl.2.1 <- function(peaklist) { + # peaklist = outlist.tot + # resultDir = "./results" + # scanmode = "positive" + + # peaklist=peaklist[1:100000,] + + collect=NULL + remove=NULL + + index = which(duplicated(peaklist[, "mzmed.pgrp"])) + + while (length(index) > 0){ + + peaklist_index = which(peaklist[, "mzmed.pgrp"] == peaklist[index[1], "mzmed.pgrp"]) + tmp=peaklist[peaklist_index[1],,drop=FALSE] + if (!is.na(peaklist[peaklist_index[1],"assi_HMDB"])) tmp[,"assi_HMDB"]=paste(peaklist[peaklist_index,"assi_HMDB"],collapse = ";") else tmp[,"assi_HMDB"]=NA + if (!is.na(peaklist[peaklist_index[1],"iso_HMDB"])) tmp[,"iso_HMDB"]=paste(peaklist[peaklist_index,"iso_HMDB"],collapse = ";") else tmp[,"iso_HMDB"]=NA + if (!is.na(peaklist[peaklist_index[1],"HMDB_code"])) tmp[,"HMDB_code"]=paste(peaklist[peaklist_index,"HMDB_code"],collapse = ";") else tmp[,"HMDB_code"]=NA + if (peaklist[peaklist_index[1],"assi_noise"]!="") tmp[,"assi_noise"]=paste(peaklist[peaklist_index,"assi_noise"],collapse = ";") else tmp[,"assi_noise"]="" + + collect = rbind(collect, tmp) + remove = c(remove, peaklist_index) + + index=index[-which(index==index[1])] + } + + peaklist = peaklist[-remove,] + peaklist = rbind(peaklist,collect) + + return(peaklist) +} diff --git a/DIMS/AddOnFunctions/remove.dupl.R b/DIMS/AddOnFunctions/remove.dupl.R new file mode 100644 index 0000000..227f207 --- /dev/null +++ b/DIMS/AddOnFunctions/remove.dupl.R @@ -0,0 +1,39 @@ +# remove duplicates, peak groups with mz within a few ppm +# ppmdef should be 2 times bigger than during identification!!! +remove.dupl <- function(peaklist, ppmdef=4, tolint=0.01) { + + # peaklist = outpgrlist + # ppmdef = 2 + # tolint = 0.01 + + int.cols = 7:ncol(peaklist) + + p <- 1 + while (p < nrow(peaklist)) { + mzref <- peaklist[p, "mzmed.pgrp"] + # print(mzref) + dist.ppm <- ppmdef * mzref / 1000000 + sel <- peaklist[ , "mzmed.pgrp"] > (mzref - dist.ppm) & peaklist[ , "mzmed.pgrp"] < (mzref + dist.ppm) + subset <- peaklist[sel, ] + if (nrow(subset) > 1) { + avi <- rep(1, max(int.cols)) + for (c in int.cols) { avi[c] <- max(subset[ ,c])/mean(subset[ ,c]) } + + # remove NaN + avi[which(is.nan(avi))] = 1 + + if (mean(avi) > (1-tolint) & mean(avi < (1+tolint))) { + peaklist <- peaklist[-which(sel), ] + newline <- subset[1, ] + newline[ , "mzmed.pgrp"] <- mean(subset[ , "mzmed.pgrp"]) + newline[ , "mzmin.pgrp"] <- min(subset[ , "mzmin.pgrp"]) + newline[ , "mzmax.pgrp"] <- max(subset[ , "mzmax.pgrp"]) + newline[ , int.cols] <- apply(subset[ , int.cols], 2, max) + # newline[ , "avg.int"] <- mean(as.numeric(newline[ , int.cols])) + peaklist <- rbind(peaklist, newline) + } + } + p <- p + 1 + } + peaklist +} diff --git a/DIMS/AddOnFunctions/removeFromRepl.pat.R b/DIMS/AddOnFunctions/removeFromRepl.pat.R new file mode 100644 index 0000000..ad5b1c8 --- /dev/null +++ b/DIMS/AddOnFunctions/removeFromRepl.pat.R @@ -0,0 +1,31 @@ +removeFromRepl.pat <- function(bad_samples, repl_pattern, nr_replicates) { + + tmp = repl_pattern + + removeFromGroup=NULL + + for (i in 1:length(tmp)){ + tmp2 = repl_pattern[[i]] + + remove=NULL + + for (j in 1:length(tmp2)){ + if (tmp2[j] %in% bad_samples){ + #cat(tmp2[j]) + #cat(paste("remove",tmp2[j])) + #cat(paste("remove i",i)) + #cat(paste("remove j",j)) + remove = c(remove, j) + } + } + + if (length(remove)==nr_replicates) removeFromGroup=c(removeFromGroup,i) + if (!is.null(remove)) repl_pattern[[i]]=repl_pattern[[i]][-remove] + } + + if (length(removeFromGroup)!=0) { + repl_pattern=repl_pattern[-removeFromGroup] + } + + return(list("pattern"=repl_pattern)) +} diff --git a/DIMS/AddOnFunctions/replaceZeros.R b/DIMS/AddOnFunctions/replaceZeros.R new file mode 100644 index 0000000..234d961 --- /dev/null +++ b/DIMS/AddOnFunctions/replaceZeros.R @@ -0,0 +1,90 @@ +replaceZeros <- function(outpgrlist, repl_pattern, scanmode, resol, outdir, thresh, ppm) { + # file="./results/grouping_rest/negative_1.RData" + # scanmode= "negative" + # scriptDir="./scripts" + # resol=140000 + # thresh=2000 + # outdir="./results" + + # control_label="C" + + # source(paste(scriptDir, "AddOnFunctions/sourceDir.R", sep="/")) + # sourceDir(paste(scriptDir, "AddOnFunctions", sep="/")) + + # dir.create(paste(outdir, "9-samplePeaksFilled", sep="/"), showWarnings = FALSE) + + # int.factor=1*10^5 # Number of x used to calc area under Gaussian (is not analytic) + # scale=2 # Initial value used to estimate scaling parameter + # width=1024 + # height=768 + + # load(paste0(outdir, "/repl.pattern.",scanmode, ".RData")) + + # batch_number = strsplit(basename(HMDB_part_file), ".",fixed = TRUE)[[1]][2] + # name = as.vector(unlist(strsplit(file, "/", fixed=TRUE))) + # name = name[length(name)] + # message(paste("File name: ", name)) + + # load samplePeaks + # load outpgrlist + # load(file) + + # # filter on at least signal in two control samples + # int.cols = grep(control_label, colnames(outpgrlist),fixed = TRUE) + # # barplot(as.numeric(outpgrlist[753, int.cols])) + # keep = NULL + # keep = apply(outpgrlist, 1, function(x) if (length(which(as.numeric(x[int.cols]) > 0)) > 1) keep=c(keep,TRUE) else keep=c(keep,FALSE)) + # outpgrlist = outpgrlist[keep,] + + # replace zeros + if (!is.null(outpgrlist)) { + print(dim(outpgrlist)) + print(colnames(outpgrlist)) + for (i in 1:length(names(repl_pattern))){ + print(names(repl_pattern)[i]) + samplePeaks=outpgrlist[,names(repl_pattern)[i]] + index=which(samplePeaks<=0) + if (!length(index)){ + next + } + for (j in 1:length(index)){ + area = generateGaussian(outpgrlist[index[j],"mzmed.pgrp"],thresh,resol,FALSE,scanmode,int.factor=1*10^5,1,1)$area + # for testing purposes, add a fixed random seed + # set.seed(123) + outpgrlist[index[j], names(repl_pattern)[i]] = rnorm(n=1, mean=area, sd=0.25*area) + } + } + + + # Identification + + # Add average column + outpgrlist = cbind(outpgrlist, "avg.int"=apply(outpgrlist[, 7:(ncol(outpgrlist)-4)], 1, mean)) + + if (scanmode=="negative"){ + label = "MNeg" + label2 = "Negative" + # take out multiple NaCl adducts + look4.add2 <- c("Cl", "Cl37", "For", "NaCl","KCl","H2PO4","HSO4","Na-H","K-H","H2O","I") # ,"min2H","min3H" + # HMDB_add_iso=HMDB_add_iso.Neg + } else { + label = "Mpos" + label2 = "Positive" + # take out NaCl adducts + look4.add2 <- c("Na", "K", "NaCl", "NH4","2Na-H","CH3OH","KCl","NaK-H") # ,"NaCl2","NaCl3","NaCl4","NaCl5") + # HMDB_add_iso=HMDB_add_iso.Pos + } + + # Identify noise peaks + noise.MZ <- read.table(file="/hpc/dbg_mz/tools/db/TheoreticalMZ_NegPos_incNaCl.txt", sep="\t", header=TRUE, quote = "") + noise.MZ <- noise.MZ[(noise.MZ[ , label] != 0), 1:4] + final.outlist.idpat2 = ident.hires.noise.HPC(outpgrlist, allAdducts, scanmode=label2, noise.MZ, look4=look4.add2, resol=resol, slope=0, incpt=0, ppm.fixed=ppm, ppm.iso.fixed=ppm) + tmp <- final.outlist.idpat2[ , c("assi", "theormz")] + colnames(tmp) <- c("assi_noise", "theormz_noise") + + final.outlist.idpat3 <- cbind(outpgrlist, tmp) + + return(final.outlist.idpat3) + # save(final.outlist.idpat3, file=paste("./", name, sep="")) + } +} diff --git a/DIMS/AddOnFunctions/replaceZeros_setseed.R b/DIMS/AddOnFunctions/replaceZeros_setseed.R new file mode 100644 index 0000000..4fafc9a --- /dev/null +++ b/DIMS/AddOnFunctions/replaceZeros_setseed.R @@ -0,0 +1,108 @@ +replaceZeros <- function(file,scanmode,resol,outdir,thresh,scriptDir,ppm){ + # file="./results/grouping_rest/negative_1.RData" + # scanmode= "negative" + # scriptDir="./scripts" + # resol=140000 + # thresh=2000 + # outdir="./results" + + control_label="C" + + source(paste(scriptDir, "AddOnFunctions/sourceDir.R", sep="/")) + sourceDir(paste(scriptDir, "AddOnFunctions", sep="/")) + + dir.create(paste(outdir, "9-samplePeaksFilled", sep="/"), showWarnings = FALSE) + + # int.factor=1*10^5 # Number of x used to calc area under Gaussian (is not analytic) + # scale=2 # Initial value used to estimate scaling parameter + # width=1024 + # height=768 + + # message(paste("file", file)) + # message(paste("scanmode", scanmode)) + # message(paste("resol", resol)) + # message(paste("outdir", outdir)) + # message(paste("thresh", thresh)) + # message(paste("scriptDir", scriptDir)) + + load(paste0(outdir, "/repl.pattern.",scanmode, ".RData")) + + name = as.vector(unlist(strsplit(file, "/", fixed=TRUE))) + name = name[length(name)] + # message(paste("File name: ", name)) + + # load samplePeaks + # load outpgrlist + load(file) + + # ################################################################################# + # # filter on at least signal in two control samples + # int.cols = grep(control_label, colnames(outpgrlist),fixed = TRUE) + # # barplot(as.numeric(outpgrlist[753, int.cols])) + # keep = NULL + # keep = apply(outpgrlist, 1, function(x) if (length(which(as.numeric(x[int.cols]) > 0)) > 1) keep=c(keep,TRUE) else keep=c(keep,FALSE)) + # outpgrlist = outpgrlist[keep,] + # ################################################################################# + + ################################################################################ + # For now only replace zeros + if (!is.null(outpgrlist)) { + for (i in 1:length(names(repl.pattern.filtered))){ + samplePeaks=outpgrlist[,names(repl.pattern.filtered)[i]] + index=which(samplePeaks<=0) + if (!length(index)){ + next + } + for (j in 1:length(index)){ + area = generateGaussian(outpgrlist[index[j],"mzmed.pgrp"],thresh,resol,FALSE,scanmode,int.factor=1*10^5,1,1)$area + # for testing purposes, add a fixed random seed + set.seed(123) + outpgrlist[index[j], names(repl.pattern.filtered)[i]] = rnorm(n=1, mean=area, sd=0.25*area) + } + } + + ################################################################################ + + + #################### identification ######################################################### + # load(paste(scriptDir, "../db/HMDB_add_iso_corrNaCl.RData", sep="/")) # E:\Metabolomics\LargeDataBase\Apr25_2016 + + # Add average column + outpgrlist = cbind(outpgrlist, "avg.int"=apply(outpgrlist[, 7:(ncol(outpgrlist)-4)], 1, mean)) + + if (scanmode=="negative"){ + label = "MNeg" + label2 = "Negative" + # take out multiple NaCl adducts + look4.add2 <- c("Cl", "Cl37", "For", "NaCl","KCl","H2PO4","HSO4","Na-H","K-H","H2O","I") # ,"min2H","min3H" + # HMDB_add_iso=HMDB_add_iso.Neg + } else { + label = "Mpos" + label2 = "Positive" + # take out NaCl adducts + look4.add2 <- c("Na", "K", "NaCl", "NH4","2Na-H","CH3OH","KCl","NaK-H") # ,"NaCl2","NaCl3","NaCl4","NaCl5") + # HMDB_add_iso=HMDB_add_iso.Pos + } + + # # Identification using large database + # final.outlist.idpat = iden.code(outpgrlist, HMDB_add_iso, ppm=2, label) + # message(paste(sum(final.outlist.idpat[ , "assi_HMDB"] != ""), "assigned peakgroups")) + # message(paste(sum(final.outlist.idpat[ , "iso_HMDB"] != ""), "assigned isomeres")) + + # Identify noise peaks + noise.MZ <- read.table(file="/hpc/dbg_mz/tools/db/TheoreticalMZ_NegPos_incNaCl.txt", sep="\t", header=TRUE, quote = "") + noise.MZ <- noise.MZ[(noise.MZ[ , label] != 0), 1:4] + + # Replace "Negative" by "negative" in ident.hires.noise + final.outlist.idpat2 = ident.hires.noise.HPC(outpgrlist, allAdducts, scanmode=label2, noise.MZ, look4=look4.add2, resol=resol, slope=0, incpt=0, ppm.fixed=ppm, ppm.iso.fixed=ppm) + # message(paste(sum(final.outlist.idpat2[ , "assi"] != ""), "assigned noise peaks")) + tmp <- final.outlist.idpat2[ , c("assi", "theormz")] + colnames(tmp) <- c("assi_noise", "theormz_noise") + + final.outlist.idpat3 <- cbind(outpgrlist, tmp) + ############################################################################################# + + # message(paste("File saved: ", paste(outdir, "/samplePeaksFilled/", name, sep=""))) + save(final.outlist.idpat3, file=paste(outdir, "/9-samplePeaksFilled/", name, sep="")) + } +} diff --git a/DIMS/AddOnFunctions/run.vbs b/DIMS/AddOnFunctions/run.vbs new file mode 100644 index 0000000..640116a --- /dev/null +++ b/DIMS/AddOnFunctions/run.vbs @@ -0,0 +1,22 @@ +Option Explicit +On Error Resume Next +RunExcelMacro + +Sub RunExcelMacro() + +Dim xlApp +Dim xlBook +Dim xlBook_persenal + +Set xlApp = CreateObject("Excel.Application") +xlApp.DisplayAlerts = FALSE +Set xlBook_persenal = xlApp.Workbooks.Open("C:\Users\awillem8\AppData\Roaming\Microsoft\Excel\XLSTART\PERSONAL.XLSB", 0, True) +Set xlBook = xlApp.Workbooks.Open("C:\Users\mkerkho7\testmap\xls\P181_Aberant_HMDB.xlsx", 0, True) +xlApp.Run "PERSONAL.XLSB!FixAndResize" +xlBook.SaveAs "C:\Users\mkerkho7\testmap\results\P181_Aberant_HMDB.xlsx" +xlBook.Close False +xlApp.Quit +Set xlBook = Nothing +Set xlBook_persenal = Nothing +Set xlApp = Nothing +End Sub diff --git a/DIMS/AddOnFunctions/runVBAMacro.R b/DIMS/AddOnFunctions/runVBAMacro.R new file mode 100644 index 0000000..3836b2f --- /dev/null +++ b/DIMS/AddOnFunctions/runVBAMacro.R @@ -0,0 +1,48 @@ +runVBAMacro <- function(dir, dir2, vb_script) { + +# dir="E:\\Metabolomics\\Lynne_BSP-2015-08-05\\results\\xls\\" +# dir2="E:\\Metabolomics\\Lynne_BSP-2015-08-05\\results\\xls_fixed\\" + +dir.create(dir2) +files=list.files(dir) + +script_1 = paste("Option Explicit +On Error Resume Next +RunExcelMacro + +Sub RunExcelMacro() + +Dim xlApp +Dim xlBook +Dim xlBook_persenal + +Set xlApp = CreateObject(\"Excel.Application\") +xlApp.DisplayAlerts = FALSE +Set xlBook_persenal = xlApp.Workbooks.Open(\"C:\\Users\\awillem8\\AppData\\Roaming\\Microsoft\\Excel\\XLSTART\\PERSONAL.XLSB\", 0, True) +Set xlBook = xlApp.Workbooks.Open(\"", dir, sep="") + +script_2 = "\", 0, True) +xlApp.Run \"PERSONAL.XLSB!FixAndResize\" +xlBook.SaveAs \"" + +script_3 = "\" +xlBook.Close False +xlApp.Quit +Set xlBook = Nothing +Set xlBook_persenal = Nothing +Set xlApp = Nothing +End Sub" + +for (i in 1:length(files)){ + script = paste(script_1, files[i], script_2, dir2, files[i], script_3, sep="") # dir2, files[i], + + message(script) + + fileConn = file("./src/run.vbs") + writeLines(script, fileConn) + close(fileConn) + + system(vb_script) +} +} + diff --git a/DIMS/AddOnFunctions/searchMZRange.R b/DIMS/AddOnFunctions/searchMZRange.R new file mode 100644 index 0000000..43a7171 --- /dev/null +++ b/DIMS/AddOnFunctions/searchMZRange.R @@ -0,0 +1,187 @@ +searchMZRange <- function(range,values,int.factor,scale,resol,outdir,sampname,scanmode,plot,width,height,thresh){ + # range=sub_range + + end=NULL + index=as.vector(which(range!=0)) + + # bad infusion + if (length(index)==0) return(values) + + start=index[1] + subRangeLength = 15 + + for (i in 1:length(index)){ + # for (i in 1:129000){ + + if (i 1){ + + end=index[i] + + # start=395626 + # end=395640 + # i=25824 + + # gaan met de banaan + # 128836 + # start 1272853 + # mz start 316.83302 + # end 1272870 + # mz end 316.84269 + + x = as.numeric(names(range)[c(start:end)]) + y = as.vector(range[c(start:end)]) + # # Trim zeros + # x = as.vector(trimZeros(x,y)[[1]]) + # y = as.vector(trimZeros(x,y)[[2]]) + + if (length(y)!=0) { + + # check if intensity above thresh + if (max(y) < thresh | is.nan(max(y))) { + start=index[i+1] + next + } + + # cat("gaan met de banaan") + # cat(i) + # cat(paste("start", start, sep=" ")) + # cat(paste("mz start", x[1], sep=" ")) + # cat(paste("end", end, sep=" ")) + # cat(paste("mz end", x[length(x)], sep=" ")) + + if (length(y)>subRangeLength) { + + y[which(y3) { + # Check only zeros + if (sum(y)==0) next + + rval = fitGaussianInit(x,y,int.factor,scale,resol,outdir,sampname, scanmode, plot,width,height,i,start,end) + + if (rval$qual[1]==1) { + + rval = generateGaussian(x,y,resol,plot,scanmode,int.factor, width, height) + + values$mean = c(values$mean, rval$mean) + values$area = c(values$area, rval$area) + values$nr = c(values$nr, sampname) + values$min = c(values$min, rval$min) + values$max = c(values$max, rval$max) + values$qual = c(values$qual, 0) + + values$spikes = values$spikes + 1 + + } else { + for (j in 1:length(rval$mean)){ + values$mean = c(values$mean, rval$mean[j]) + values$area = c(values$area, rval$area[j]) + values$nr = c(values$nr, sampname) + values$min = c(values$min, rval$min[1]) + values$max = c(values$max, rval$max[1]) + values$qual = c(values$qual, rval$qual[1]) + } + } + + } else { + + rval = generateGaussian(x,y,resol,plot,scanmode,int.factor, width, height) + + values$mean = c(values$mean, rval$mean) + values$area = c(values$area, rval$area) + values$nr = c(values$nr, sampname) + values$min = c(values$min, rval$min) + values$max = c(values$max, rval$max) + values$qual = c(values$qual, 0) + + values$spikes = values$spikes + 1 + } + } + start=index[i+1] + } + } + + # last little range + end = index[length(index)] + x = as.numeric(names(range)[c(start:end)]) + y = as.vector(range[c(start:end)]) + + # x = as.vector(trimZeros(x,y)[[1]]) + # y = as.vector(trimZeros(x,y)[[2]]) + + if (length(y)!=0) { + + # check if intensity above thresh + if (max(y) < thresh | is.nan(max(y))) { + #start=index[i+1] + # do nothing!! + } else { + + # cat("gaan met de banaan") + # cat(paste("start", start, sep=" ")) + # cat(paste("mz start", x[1], sep=" ")) + # cat(paste("end", end, sep=" ")) + # cat(paste("mz end", x[length(x)], sep=" ")) + + if (length(y)>subRangeLength) { + + y[which(y3) { + + # Check only zeros + if (sum(y)==0) next + + rval = fitGaussianInit(x,y,int.factor,scale,resol,outdir,sampname,scanmode,plot,width,height,i,start,end) + + if (rval$qual[1]==1) { + #cat("Quality = 1!!!") + rval = generateGaussian(x,y,resol,plot,scanmode,int.factor, width, height) + + values$mean = c(values$mean, rval$mean) + values$area = c(values$area, rval$area) + values$nr = c(values$nr, sampname) + values$min = c(values$min, rval$min) + values$max = c(values$max, rval$max) + values$qual = c(values$qual, 0) + + values$spikes = values$spikes + 1 + + } else { + for (j in 1:length(rval$mean)){ + values$mean = c(values$mean, rval$mean[j]) + values$area = c(values$area, rval$area[j]) + values$nr = c(values$nr, sampname) + values$min = c(values$min, rval$min[1]) + values$max = c(values$max, rval$max[1]) + values$qual = c(values$qual, rval$qual[1]) + } + } + } else { + rval = generateGaussian(x,y,resol,plot,scanmode,int.factor, width, height) + + values$mean = c(values$mean, rval$mean) + values$area = c(values$area, rval$area) + values$nr = c(values$nr, sampname) + values$min = c(values$min, rval$min) + values$max = c(values$max, rval$max) + values$qual = c(values$qual, 0) + + values$spikes = values$spikes + 1 + } + } + } + + return(values) + #return(list("mean"=retVal$mean, "area"=retVal$area, "qual"=retVal$qual, "min"=retVal$min, "max"=retVal$max, "nr"=sample.nr, "spikes"=spikes)) +} diff --git a/DIMS/AddOnFunctions/sourceDir.R b/DIMS/AddOnFunctions/sourceDir.R new file mode 100644 index 0000000..abf4c9e --- /dev/null +++ b/DIMS/AddOnFunctions/sourceDir.R @@ -0,0 +1,8 @@ +# source add on functions +sourceDir <- function(path, trace = TRUE, ...) { + for (nm in list.files(path, pattern = "[.][RrSsQq]$")) { + #if(trace) cat(nm,":") + source(file.path(path, nm), ...) + #if(trace) cat("\n") + } +} diff --git a/DIMS/AddOnFunctions/statistics_z.R b/DIMS/AddOnFunctions/statistics_z.R new file mode 100644 index 0000000..0a92d3d --- /dev/null +++ b/DIMS/AddOnFunctions/statistics_z.R @@ -0,0 +1,69 @@ +statistics_z <- function(peaklist, sortCol, adducts){ + # peaklist=as.data.frame(outlist.adducts.HMDB) + # plotdir="./results/plots/adducts" + # filename="./results/allpgrps_stats.txt" + # adducts=TRUE + + # peaklist=outlist.tot + # sortCol=NULL + # adducts=FALSE + + case_label = "P" + control_label = "C" + startcol = dim(peaklist)[2]+3 + + # calculate mean and sd for Control group + ctrl.cols <- grep(control_label, colnames(peaklist),fixed = TRUE) # 5:41 + int.cols <- c(grep(control_label, colnames(peaklist),fixed = TRUE), grep(case_label, colnames(peaklist),fixed = TRUE)) + peaklist[,int.cols][peaklist[,int.cols]==0] = NA + + # tmp = data.matrix(peaklist[ , ctrl.cols], rownames.force = TRUE) + tmp = peaklist[ , ctrl.cols, drop=FALSE] + + peaklist$avg.ctrls = apply(tmp, 1, function(x) mean(as.numeric(x),na.rm = TRUE)) + peaklist$sd.ctrls = apply(tmp, 1, function(x) sd(as.numeric(x),na.rm = TRUE)) + + cnames.z = NULL + + for (i in int.cols) { + # message(i) + cname = colnames(peaklist)[i] + cnames.z = c(cnames.z, paste(cname, "Zscore", sep="_")) + zscores.1col = (as.numeric(as.vector(unlist(peaklist[ , i]))) - peaklist$avg.ctrls) / peaklist$sd.ctrls + peaklist = cbind(peaklist, zscores.1col) + } + + colnames(peaklist)[startcol:ncol(peaklist)] = cnames.z + + z.cols = grep("Zscore", colnames(peaklist),fixed = TRUE) + + if (!adducts){ + if ((dim(peaklist[, z.cols])[2]+6)!=(startcol-1)){ + ppmdev=array(1:dim(peaklist)[1], dim=c(dim(peaklist)[1])) + + # calculate ppm deviation + for (i in 1:dim(peaklist)[1]){ + if (!is.na(peaklist$theormz_HMDB[i]) & !is.null(peaklist$theormz_HMDB[i]) & (peaklist$theormz_HMDB[i]!="")){ + ppmdev[i] = 10^6*(as.numeric(as.vector(peaklist$mzmed.pgrp[i]))-as.numeric(as.vector(peaklist$theormz_HMDB[i])))/as.numeric(as.vector(peaklist$theormz_HMDB[i])) + } else { + ppmdev[i]=NA + } + } + + peaklist = cbind(peaklist[, 1:6], ppmdev=ppmdev, peaklist[ , 7:ncol(peaklist)]) + } + } + + #peaklist = peaklist[order(peaklist[,sortCol]),] + + # # Order on average Z-score + # tmp = peaklist[,grep("Zscore", colnames(peaklist))] + # tmp.p = tmp[,grep("P", colnames(tmp)),drop=FALSE] + # tmp.p.avg = apply(tmp.p, 1, mean) + # + # peaklist = cbind(peaklist, "avg.z.score"=tmp.p.avg) + # peaklist = peaklist[order(abs(tmp.p.avg), decreasing = TRUE),] + + return(peaklist) + +} diff --git a/DIMS/AddOnFunctions/sumCurves.R b/DIMS/AddOnFunctions/sumCurves.R new file mode 100644 index 0000000..bcc1485 --- /dev/null +++ b/DIMS/AddOnFunctions/sumCurves.R @@ -0,0 +1,39 @@ +sumCurves <- function(mean1, mean2, scale1, scale2, sigma1, sigma2, x2, x, resol, plot) { + # mean1=mean[i-1] + # mean2=mean[i] + # scale1=scale[i-1] + # scale2=scale[i] + # sigma1=sigma[i-1] + # sigma2=sigma[i] + + # message("=============================================================> sum 2 curves!") + + sumFit=(scale1*dnorm(x2,mean1,sigma1))+(scale2*dnorm(x2,mean2,sigma2)) + if (plot) lines(x2,sumFit,col="brown") + + #mean1plus2 = mean(c(mean1,mean2)) + mean1plus2 = weighted.mean(c(mean1,mean2),c(max(scale1*dnorm(x2,mean1,sigma1)),max(scale2*dnorm(x2,mean2,sigma2)))) + + if (plot) abline(v = mean1plus2, col="brown") + fwhm = getFwhm(mean1plus2, resol) + half_max = max(sumFit)*0.5 + if (plot) lines(c(mean1plus2 - 0.5*fwhm, mean1plus2 + 0.5*fwhm),c(half_max,half_max),col="orange") + + # sumFit=(scale1*dnorm(x,mean1,sigma1))+(scale2*dnorm(x,mean2,sigma2)) + # fq=abs(sum(y) - sum(sumFit))/sum(y) + + # h2=c(paste("mean =", mean1plus2, sep=" "), + # paste("fq =", fq, sep=" ")) + # + # legend("topright", legend=h2) + + # I assume that the sum of the distributions if also normal, which is not + #area = sum(scale1*dnorm(x2,mean1,sigma1))+sum(scale2*dnorm(x2,mean2,sigma2)) + #area = max(scale1*dnorm(x2,mean1,sigma1))+max(scale2*dnorm(x2,mean2,sigma2)) + area = max(sumFit) + scale = scale1 + scale2 + sigma = (fwhm/2)*0.85 + + return(list("mean"= mean1plus2,"area"=area, "scale"=scale, "sigma"=sigma)) # "qual"=fq + +} diff --git a/DIMS/AddOnFunctions/trimZeros.R b/DIMS/AddOnFunctions/trimZeros.R new file mode 100644 index 0000000..0fc5ea3 --- /dev/null +++ b/DIMS/AddOnFunctions/trimZeros.R @@ -0,0 +1,8 @@ +trimZeros <- function(x, y) { + tmp = which(y==0) + if (length(tmp)!=0){ + y = y[-tmp] + x = x[-tmp] + } + return(list(x,y)) +} diff --git a/DIMS/AssignToBins.R b/DIMS/AssignToBins.R new file mode 100644 index 0000000..4ca7fce --- /dev/null +++ b/DIMS/AssignToBins.R @@ -0,0 +1,128 @@ +## adapted from 2-DIMS.R + +# load required packages +suppressPackageStartupMessages(library("xcms")) + +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) + +mzml_filepath <- cmd_args[1] +breaks_filepath <- cmd_args[2] +resol <- as.numeric(cmd_args[3]) +trim <- 0.1 +dims_thresh <- 100 + +# load breaks_fwhm +load(breaks_filepath) + +# get sample name +sample_name <- sub("\\..*$", "", basename(mzml_filepath)) + +options(digits = 16) + +# Initialize +pos_results <- NULL +neg_results <- NULL + +# read in the data for 1 sample +raw_data <- suppressMessages(xcms::xcmsRaw(mzml_filepath)) + +# for TIC plots: prepare txt files with data for plots +tic_intensity_persample <- cbind(round(raw_data@scantime, 2), raw_data@tic) +colnames(tic_intensity_persample) <- c("retention_time", "tic_intensity") +write.table(tic_intensity_persample, file = paste0(sample_name, "_TIC.txt")) + +# Create empty placeholders for later use +bins <- rep(0, length(breaks_fwhm) - 1) +pos_bins <- bins +neg_bins <- bins + +# Generate a matrix +raw_data_matrix <- xcms::rawMat(raw_data) + +# Get time values for positive and negative scans +pos_times <- raw_data@scantime[raw_data@polarity == "positive"] +neg_times <- raw_data@scantime[raw_data@polarity == "negative"] +# Select scans between trim_left and trim_right +pos_times <- pos_times[pos_times > trim_left & pos_times < trim_right] +neg_times <- neg_times[neg_times > trim_left & neg_times < trim_right] + +# Generate an index with which to select values for each mode +pos_index <- which(raw_data_matrix[, "time"] %in% pos_times) +neg_index <- which(raw_data_matrix[, "time"] %in% neg_times) +# Separate each mode into its own matrix +pos_raw_data_matrix <- raw_data_matrix[pos_index, ] +neg_raw_data_matrix <- raw_data_matrix[neg_index, ] + +# Get index for binning intensity values +bin_indices_pos <- cut( + pos_raw_data_matrix[, "mz"], + breaks_fwhm, + include.lowest = TRUE, + right = TRUE, + labels = FALSE +) +bin_indices_neg <- cut( + neg_raw_data_matrix[, "mz"], + breaks_fwhm, + include.lowest = TRUE, + right = TRUE, + labels = FALSE +) + +# Get the list of intensity values for each bin, and add the +# intensity values which are in the same bin +if (nrow(pos_raw_data_matrix) > 0) { + # set NA in intensities to zero + pos_raw_data_matrix[is.na(pos_raw_data_matrix[, "intensity"]), "intensity"] <- 0 + # aggregate intensities, calculate mean, use only values above dims_thresh + aggr_int_pos <- stats::aggregate(pos_raw_data_matrix[, "intensity"], + list(bin_indices_pos), + FUN = function(x) { mean(x[which(x > dims_thresh)]) }) + # set NA to zero in second column + aggr_int_pos[is.na(aggr_int_pos[, 2]), 2] <- 0 + pos_bins[aggr_int_pos[, 1]] <- aggr_int_pos[, 2] +} +if (nrow(neg_raw_data_matrix) > 0) { + # set NA in intensities to zero + neg_raw_data_matrix[is.na(neg_raw_data_matrix[, "intensity"]), "intensity"] <- 0 + # aggregate intensities, calculate mean, use only values above dims_thresh + aggr_int_neg <- stats::aggregate(neg_raw_data_matrix[, "intensity"], + list(bin_indices_neg), + FUN = function(x) { mean(x[which(x > dims_thresh)]) }) + # set NA to zero in second column + aggr_int_neg[is.na(aggr_int_neg[, 2]), 2] <- 0 + neg_bins[aggr_int_neg[, 1]] <- aggr_int_neg[, 2] +} + +# Zero any values that are below the threshold +pos_bins[pos_bins < dims_thresh] <- 0 +neg_bins[neg_bins < dims_thresh] <- 0 + +pos_results <- cbind(pos_results, pos_bins) +neg_results <- cbind(neg_results, neg_bins) + +# transpose +pos_results_transpose <- t(pos_results) +neg_results_transpose <- t(neg_results) + +# Add file names as row names +rownames(pos_results_transpose) <- sample_name +rownames(neg_results_transpose) <- sample_name + +# delete the last value of breaks_fwhm_avg to match dimensions of pos_results and neg_results +breaks_fwhm_avg_minuslast <- breaks_fwhm_avg[-length(breaks_fwhm_avg)] +# Format as string and show precision of float to 5 digits +breaks_fwhm_avg_minuslast <- sprintf("%.5f", breaks_fwhm_avg_minuslast) + +# Use this as the column names +colnames(pos_results_transpose) <- breaks_fwhm_avg_minuslast +colnames(neg_results_transpose) <- breaks_fwhm_avg_minuslast + +# transpose back +pos_results_final <- t(pos_results_transpose) +neg_results_final <- t(neg_results_transpose) + +peak_list <- list("pos" = pos_results_final, "neg" = neg_results_final, "breaksFwhm" = breaks_fwhm) + +save(peak_list, file = paste0(sample_name, ".RData")) diff --git a/DIMS/AssignToBins.nf b/DIMS/AssignToBins.nf new file mode 100644 index 0000000..d0a7098 --- /dev/null +++ b/DIMS/AssignToBins.nf @@ -0,0 +1,20 @@ +process AssignToBins { + tag "DIMS AssignToBins ${file_id}" + label 'AssignToBins' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + tuple(val(file_id), path(mzML_file), path(breaks_file)) + + output: + path("${file_id}.RData"), emit: rdata_file + path("${file_id}_TIC.txt"), emit: tic_txt_file + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/AssignToBins.R $mzML_file $breaks_file $params.resolution + """ +} + + diff --git a/DIMS/AverageTechReplicates.R b/DIMS/AverageTechReplicates.R new file mode 100644 index 0000000..6a773f5 --- /dev/null +++ b/DIMS/AverageTechReplicates.R @@ -0,0 +1,190 @@ +# adapted from 3-AverageTechReplicates.R + +# load packages +library("ggplot2") +library("gridExtra") + +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) + +init_file <- cmd_args[1] +nr_replicates <- as.numeric(cmd_args[2]) +run_name <- cmd_args[3] +dims_matrix <- cmd_args[4] +highest_mz_file <- cmd_args[5] +highest_mz <- get(load(highest_mz_file)) +thresh2remove <- 1000000000 +dims_thresh <- 100 + +remove_from_repl_pattern <- function(bad_samples, repl_pattern, nr_replicates) { + # collect list of samples to remove from replication pattern + remove_from_group <- NULL + for (sample_nr in 1:length(repl_pattern)){ + repl_pattern_1sample <- repl_pattern[[sample_nr]] + remove <- NULL + for (file_nr in 1:length(repl_pattern_1sample)) { + if (repl_pattern_1sample[file_nr] %in% bad_samples) { + remove <- c(remove, file_nr) + } + } + if (length(remove) == nr_replicates) { + remove_from_group <- c(remove_from_group, sample_nr) + } + if (!is.null(remove)) { + repl_pattern[[sample_nr]] <- repl_pattern[[sample_nr]][-remove] + } + } + if (length(remove_from_group) != 0) { + repl_pattern <- repl_pattern[-remove_from_group] + } + return(list("pattern" = repl_pattern)) +} + +# get repl_pattern +load(init_file) + +# lower the threshold below which a sample will be removed for DBS and for high m/z +if (dims_matrix == "DBS") { + thresh2remove <- 500000000 +} +if (highest_mz > 700) { + thresh2remove <- 1000000 +} + +# remove technical replicates which are below the threshold +remove_neg <- NULL +remove_pos <- NULL +cat("Pklist sum threshold to remove technical replicate:", thresh2remove, "\n") +for (sample_nr in 1:length(repl_pattern)) { + tech_reps <- as.vector(unlist(repl_pattern[sample_nr])) + tech_reps_array_pos <- NULL + tech_reps_array_neg <- NULL + sum_neg <- 0 + sum_pos <- 0 + nr_pos <- 0 + nr_neg <- 0 + for (file_nr in 1:length(tech_reps)) { + load(paste(tech_reps[file_nr], ".RData", sep = "")) + cat("\n\nParsing", tech_reps[file_nr]) + # negative scanmode + cat("\n\tNegative peak_list sum", sum(peak_list$neg[, 1])) + if (sum(peak_list$neg[, 1]) < thresh2remove) { + cat(" ... Removed") + remove_neg <- c(remove_neg, tech_reps[file_nr]) + } else { + nr_neg <- nr_neg + 1 + sum_neg <- sum_neg + peak_list$neg + } + tech_reps_array_neg <- cbind(tech_reps_array_neg, peak_list$neg) + # positive scanmode + cat("\n\tPositive peak_list sum", sum(peak_list$pos[, 1])) + if (sum(peak_list$pos[, 1]) < thresh2remove) { + cat(" ... Removed") + remove_pos <- c(remove_pos, tech_reps[file_nr]) + } else { + nr_pos <- nr_pos + 1 + sum_pos <- sum_pos + peak_list$pos + } + tech_reps_array_pos <- cbind(tech_reps_array_pos, peak_list$pos) + } + # save to file + if (nr_neg != 0) { + sum_neg[, 1] <- sum_neg[, 1] / nr_neg + colnames(sum_neg) <- names(repl_pattern)[sample_nr] + save(sum_neg, file = paste0(names(repl_pattern)[sample_nr], "_neg_avg.RData")) + } + if (nr_pos != 0) { + sum_pos[, 1] <- sum_pos[, 1] / nr_pos + colnames(sum_pos) <- names(repl_pattern)[sample_nr] + save(sum_pos, file = paste0(names(repl_pattern)[sample_nr], "_pos_avg.RData")) + } +} + +pattern_list <- remove_from_repl_pattern(remove_neg, repl_pattern, nr_replicates) +repl_pattern_filtered <- pattern_list$pattern +save(repl_pattern_filtered, file = "negative_repl_pattern.RData") +write.table( + remove_neg, + file = "miss_infusions_negative.txt", + row.names = FALSE, + col.names = FALSE, + sep = "\t" +) + +pattern_list <- remove_from_repl_pattern(remove_pos, repl_pattern, nr_replicates) +repl_pattern_filtered <- pattern_list$pattern +save(repl_pattern_filtered, file = "positive_repl_pattern.RData") +write.table( + remove_pos, + file = "miss_infusions_positive.txt", + row.names = FALSE, + col.names = FALSE, + sep = "\t" +) + +## generate TIC plots +# get all txt files +tic_files <- list.files("./", full.names = TRUE, pattern = "*TIC.txt") +all_samps <- sub("_TIC\\..*$", "", basename(tic_files)) + +# determine maximum intensity +highest_tic_max <- 0 +for (file in tic_files) { + tic <- read.table(file) + this_tic_max <- max(tic$tic_intensity) + if (this_tic_max > highest_tic_max) { + highest_tic_max <- this_tic_max + max_sample <- sub("_TIC\\..*$", "", basename(file)) + } +} + +# create a list with information for all TIC plots +tic_plot_list <- list() +plot_nr <- 0 +for (sample_nr in c(1:length(repl_pattern))) { + tech_reps <- as.vector(unlist(repl_pattern[sample_nr])) + sample_name <- names(repl_pattern)[sample_nr] + for (file_nr in 1:length(tech_reps)) { + plot_nr <- plot_nr + 1 + # repl1_nr <- read.table(paste(paste(outdir, "2-pklist/", sep = "/"), tech_reps[file_nr], "_TIC.txt", sep = "")) + repl1_nr <- read.table(paste0(tech_reps[file_nr], "_TIC.txt")) + bad_color_pos <- tech_reps[file_nr] %in% remove_pos[[1]] + bad_color_neg <- tech_reps[file_nr] %in% remove_neg[[1]] + if (bad_color_neg & bad_color_pos) { + plot_color <- "#F8766D" + } else if (bad_color_pos) { + plot_color <- "#ED8141" + } else if (bad_color_neg) { + plot_color <- "#BF80FF" + } else { + plot_color <- "white" + } + tic_plot <- ggplot(repl1_nr, aes(retention_time, tic_intensity)) + + geom_line(linewidth = 0.3) + + geom_hline(yintercept = highest_tic_max, col = "grey", linetype = 2, linewidth = 0.3) + + labs(x = "t (s)", y = "tic_intensity", title = paste0(tech_reps[file_nr], " || ", sample_name)) + + theme(plot.background = element_rect(fill = plot_color), + axis.text = element_text(size = 4), + axis.title = element_text(size = 4), + plot.title = element_text(size = 6)) + tic_plot_list[[plot_nr]] <- tic_plot + } +} + +# create a layout matrix dependent on number of replicates +layout <- matrix(1:(10 * nr_replicates), 10, nr_replicates, TRUE) +# put TIC plots in matrix +tic_plot_pdf <- marrangeGrob( + grobs = tic_plot_list, + nrow = 10, ncol = nr_replicates, + layout_matrix = layout, + top = quote(paste( + "TICs of run", run_name, + " \n colors: red = both modes misinjection, orange = pos mode misinjection, purple = neg mode misinjection \n ", + g, "/", npages + )) +) + +# save to file +ggsave(filename = paste0(run_name, "_TICplots.pdf"), + tic_plot_pdf, width = 21, height = 29.7, units = "cm") diff --git a/DIMS/AverageTechReplicates.nf b/DIMS/AverageTechReplicates.nf new file mode 100644 index 0000000..5387e61 --- /dev/null +++ b/DIMS/AverageTechReplicates.nf @@ -0,0 +1,33 @@ +process AverageTechReplicates { + tag "DIMS AverageTechReplicates" + label 'AverageTechReplicates' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(rdata_file) + path(tic_txt_files) + path(init_file) + val(nr_replicates) + val(analysis_id) + val(matrix) + path(highest_mz_file) + + output: + path('*_repl_pattern.RData'), emit: pattern_files + path('*_avg.RData'), emit: binned_files + path('miss_infusions_negative.txt') + path('miss_infusions_positive.txt') + path('*_TICplots.pdf') + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/AverageTechReplicates.R $init_file \ + $params.nr_replicates \ + $analysis_id \ + $matrix \ + $highest_mz_file + """ +} + + diff --git a/DIMS/CollectFilled.R b/DIMS/CollectFilled.R new file mode 100755 index 0000000..a0ad358 --- /dev/null +++ b/DIMS/CollectFilled.R @@ -0,0 +1,71 @@ +## adapted from 10-collectSamplesFilled.R + +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) + +scripts_dir <- cmd_args[1] +ppm <- as.numeric(cmd_args[2]) +z_score <- as.numeric(cmd_args[3]) + +source(paste0(scripts_dir, "merge_duplicate_rows.R")) +source(paste0(scripts_dir, "calculate_zscores.R")) + +# for each scan mode, collect all filled peak group lists +scanmodes <- c("positive", "negative") +for (scanmode in scanmodes) { + # get list of files + filled_files <- list.files("./", full.names = TRUE, pattern = paste0(scanmode, "_identified_filled")) + # load files and combine into one object + outlist_total <- NULL + for (file_nr in 1:length(filled_files)) { + peakgrouplist_filled <- get(load(filled_files[file_nr])) + outlist_total <- rbind(outlist_total, peakgrouplist_filled) + } + # remove duplicates; peak groups with exactly the same m/z + outlist_total <- merge_duplicate_rows(outlist_total) + # sort on mass + outlist_total <- outlist_total[order(outlist_total[, "mzmed.pgrp"]), ] + # load replication pattern + pattern_file <- paste0(scanmode, "_repl_pattern.RData") + repl_pattern <- get(load(pattern_file)) + # calculate Z-scores + if (z_score == 1) { + outlist_stats <- calculate_zscores(outlist_total, adducts = FALSE) + nr_removed_samples <- length(which(repl_pattern[] == "character(0)")) + order_index_int <- order(colnames(outlist_stats)[8:(length(repl_pattern) - nr_removed_samples + 7)]) + outlist_stats_more <- cbind( + outlist_stats[, 1:7], + outlist_stats[, (length(repl_pattern) - nr_removed_samples + 8):(length(repl_pattern) - nr_removed_samples + 8 + 6)], + outlist_stats[, 8:(length(repl_pattern) - nr_removed_samples + 7)][order_index_int], + outlist_stats[, (length(repl_pattern) - nr_removed_samples + 5 + 10):ncol(outlist_stats)] + ) + # sort Z-score columns and append to peak group list + tmp_index <- grep("_Zscore", colnames(outlist_stats_more), fixed = TRUE) + tmp_index_order <- order(colnames(outlist_stats_more[, tmp_index])) + tmp <- outlist_stats_more[, tmp_index[tmp_index_order]] + outlist_stats_more <- outlist_stats_more[, -tmp_index] + outlist_stats_more <- cbind(outlist_stats_more, tmp) + outlist_total <- outlist_stats_more + } + + # make a copy of the outlist + outlist_ident <- outlist_total + # select identified peak groups if ppm deviation is within limits + if (z_score == 1) { + outlist_ident$ppmdev <- as.numeric(outlist_ident$ppmdev) + outlist_ident <- outlist_ident[which(outlist_ident["ppmdev"] >= -ppm & outlist_ident["ppmdev"] <= ppm), ] + } + # take care of NAs in theormz_noise + outlist_ident$theormz_noise[which(is.na(outlist_ident$theormz_noise))] <- 0 + outlist_ident$theormz_noise <- as.numeric(outlist_ident$theormz_noise) + outlist_ident$theormz_noise[which(is.na(outlist_ident$theormz_noise))] <- 0 + outlist_ident$theormz_noise <- as.numeric(outlist_ident$theormz_noise) + + # Extra output in Excel-readable format: + remove_columns <- c("fq.best", "fq.worst", "mzmin.pgrp", "mzmax.pgrp") + remove_colindex <- which(colnames(outlist_ident) %in% remove_columns) + outlist_ident <- outlist_ident[, -remove_colindex] + write.table(outlist_ident, file = paste0("outlist_identified_", scanmode, ".txt"), sep = "\t", row.names = FALSE) + # output in RData format + save(outlist_ident, file = paste0("outlist_identified_", scanmode, ".RData")) +} diff --git a/DIMS/CollectFilled.nf b/DIMS/CollectFilled.nf new file mode 100644 index 0000000..53b1967 --- /dev/null +++ b/DIMS/CollectFilled.nf @@ -0,0 +1,19 @@ +process CollectFilled { + tag "DIMS CollectFilled" + label 'CollectFilled' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(filled_files) + each path(replication_pattern) + + output: + path('outlist*.txt') + path('outlist*.RData'), emit: filled_pgrlist + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/CollectFilled.R $params.scripts_dir $params.ppm $params.zscore + """ +} diff --git a/DIMS/CollectSumAdducts.R b/DIMS/CollectSumAdducts.R new file mode 100755 index 0000000..f4f4723 --- /dev/null +++ b/DIMS/CollectSumAdducts.R @@ -0,0 +1,21 @@ +## adapted from 12-collectSamplesAdded.R + +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) + +# collect all AdductSums part files for each scanmode +scanmodes <- c("positive", "negative") + +for (scanmode in scanmodes) { + adductsum_part_files <- list.files("./", pattern = scanmode) + + outlist.tot <- NULL + for (i in 1:length(adductsum_part_files)) { + load(adductsum_part_files[i]) + outlist.tot <- rbind(outlist.tot, adductsum) + } + + # save output file + save(outlist.tot, file = paste0("AdductSums_", scanmode, ".RData")) +} + diff --git a/DIMS/CollectSumAdducts.nf b/DIMS/CollectSumAdducts.nf new file mode 100644 index 0000000..2bd43ff --- /dev/null +++ b/DIMS/CollectSumAdducts.nf @@ -0,0 +1,17 @@ +process CollectSumAdducts { + tag "DIMS CollectSumAdducts" + label 'CollectSumAdducts' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(collect_files) + + output: + path('AdductSums_*.RData') + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/CollectSumAdducts.R + """ +} diff --git a/DIMS/FillMissing.R b/DIMS/FillMissing.R new file mode 100755 index 0000000..a76c163 --- /dev/null +++ b/DIMS/FillMissing.R @@ -0,0 +1,44 @@ +## adapted from 9-runFillMissing.R + +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) + +peakgrouplist_file <- cmd_args[1] +scripts_dir <- cmd_args[2] +thresh <- as.numeric(cmd_args[3]) +resol <- as.numeric(cmd_args[4]) +ppm <- as.numeric(cmd_args[5]) +outdir <- "./" + +# load in function scripts +source(paste0(scripts_dir, "replace_zeros.R")) +source(paste0(scripts_dir, "fit_optim.R")) +source(paste0(scripts_dir, "get_fwhm.R")) +source(paste0(scripts_dir, "get_stdev.R")) +source(paste0(scripts_dir, "estimate_area.R")) +source(paste0(scripts_dir, "optimize_gaussfit.R")) +source(paste0(scripts_dir, "identify_noisepeaks.R")) +source(paste0(scripts_dir, "get_element_info.R")) +source(paste0(scripts_dir, "atomic_info.R")) + +# determine scan mode +if (grepl("_pos", peakgrouplist_file)) { + scanmode <- "positive" +} else if (grepl("_neg", peakgrouplist_file)) { + scanmode <- "negative" +} + +# get replication pattern for sample names +pattern_file <- paste0(scanmode, "_repl_pattern.RData") +repl_pattern <- get(load(pattern_file)) + +# load peak group list and determine output file name +outpgrlist_identified <- get(load(peakgrouplist_file)) + +outputfile_name <- gsub(".RData", "_filled.RData", peakgrouplist_file) + +# replace missing values (zeros) with random noise +peakgrouplist_filled <- replace_zeros(outpgrlist_identified, repl_pattern, scanmode, resol, outdir, thresh, ppm) + +# save output +save(peakgrouplist_filled, file = outputfile_name) diff --git a/DIMS/FillMissing.nf b/DIMS/FillMissing.nf new file mode 100644 index 0000000..b0d1850 --- /dev/null +++ b/DIMS/FillMissing.nf @@ -0,0 +1,18 @@ +process FillMissing { + tag "DIMS FillMissing ${peakgrouplist_file}" + label 'FillMissing' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(peakgrouplist_file) + each path(replication_pattern) + + output: + path('*_filled.RData') + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/FillMissing.R $peakgrouplist_file $params.scripts_dir $params.thresh $params.resolution $params.ppm + """ +} diff --git a/DIMS/GenerateBreaks.R b/DIMS/GenerateBreaks.R new file mode 100644 index 0000000..6aacfe8 --- /dev/null +++ b/DIMS/GenerateBreaks.R @@ -0,0 +1,51 @@ +## adapted from 1-generateBreaksFwhm.HPC.R ## + +# load required package +suppressPackageStartupMessages(library("xcms")) + +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) + +filepath <- cmd_args[1] +outdir <- cmd_args[2] +trim <- as.numeric(cmd_args[3]) +resol <- as.numeric(cmd_args[4]) + +# initialize +trim_left <- NULL +trim_right <- NULL +breaks_fwhm <- NULL +breaks_fwhm_avg <- NULL +bins <- NULL + +# read in mzML file +raw_data <- suppressMessages(xcms::xcmsRaw(filepath)) + +# trim (remove) scans at the start and end +trim_left <- round(raw_data@scantime[length(raw_data@scantime) * trim]) +trim_right <- round(raw_data@scantime[length(raw_data@scantime) * (1 - trim)]) + +# Mass range m/z +low_mz <- raw_data@mzrange[1] +high_mz <- raw_data@mzrange[2] + +# determine number of segments (bins) +nr_segments <- 2 * (high_mz - low_mz) +segment <- seq(from = low_mz, to = high_mz, length.out = nr_segments + 1) + +# determine start and end of each bin. +for (i in 1:nr_segments) { + start_segment <- segment[i] + end_segment <- segment[i+1] + resol_mz <- resol * (1 / sqrt(2) ^ (log2(start_segment / 200))) + fwhm_segment <- start_segment / resol_mz + breaks_fwhm <- c(breaks_fwhm, seq(from = (start_segment + fwhm_segment), to = end_segment, by = 0.2 * fwhm_segment)) + # average the m/z instead of start value + range <- seq(from = (start_segment + fwhm_segment), to = end_segment, by = 0.2 * fwhm_segment) + delta_mz <- range[2] - range[1] + breaks_fwhm_avg <- c(breaks_fwhm_avg, range + 0.5 * delta_mz) +} + +# generate output file +save(breaks_fwhm, breaks_fwhm_avg, trim_left, trim_right, file = "breaks.fwhm.RData") +save(high_mz, file = "highest_mz.RData") diff --git a/DIMS/GenerateBreaks.nf b/DIMS/GenerateBreaks.nf new file mode 100644 index 0000000..af617a8 --- /dev/null +++ b/DIMS/GenerateBreaks.nf @@ -0,0 +1,19 @@ +process GenerateBreaks { + tag "DIMS GenerateBreaks" + label 'GenerateBreaks' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + tuple(val(file_id), path(mzML_file)) + + + output: + path('breaks.fwhm.RData'), emit: breaks + path('highest_mz.RData'), emit: highest_mz + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/GenerateBreaks.R $mzML_file ./ $params.trim $params.resolution + """ +} diff --git a/DIMS/GenerateExcel.R b/DIMS/GenerateExcel.R new file mode 100644 index 0000000..d36ac29 --- /dev/null +++ b/DIMS/GenerateExcel.R @@ -0,0 +1,643 @@ +## adapted from 13-excelExport.R + +# load required packages +library("ggplot2") +library("reshape2") +library("openxlsx") +library("loder") +suppressMessages(library("dplyr")) +suppressMessages(library("stringr")) + +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) + +init_file <- cmd_args[1] +project <- cmd_args[2] +dims_matrix <- cmd_args[3] +hmdb_file <- cmd_args[4] +z_score <- as.numeric(cmd_args[5]) + +round_df <- function(df, digits) { + #' Round numbers to a set number of digits for numeric values + #' + #' @param df: Dataframe containing numeric values + #' @param digits: Number of digits to round off to (integer) + #' + #' @return df: Dataframe with rounded numbers + numeric_columns <- sapply(df, mode) == "numeric" + df[numeric_columns] <- round(df[numeric_columns], digits) + return(df) +} + +robust_scaler <- function(control_intensities, control_col_ids, perc = 5) { + #' Calculate robust scaler: Z-score based on controls without outliers + #' + #' @param control_intensities: Matrix with intensities for control samples + #' @param control_col_ids: Vector with column names for control samples + #' @param perc: Percentage of outliers which will be removed from controls (float) + #' + #' @return trimmed_control_intensities: Intensities trimmed for outliers + nr_to_remove <- ceiling(length(control_col_ids) * perc / 100) + sorted_control_intensities <- sort(as.numeric(control_intensities)) + trimmed_control_intensities <- sorted_control_intensities[(nr_to_remove + 1) : + (length(sorted_control_intensities) - nr_to_remove)] + return(trimmed_control_intensities) +} + +# Initialise +plot <- TRUE +export <- TRUE +control_label <- "C" +case_label <- "P" +imagesize_multiplier <- 2 +# setting outdir to export files to the working directory +outdir <- "./" +# percentage of outliers to remove from calculation of robust scaler +perc <- 5 + +# load information on samples +load(init_file) +# load the HMDB file with info on biological relevance of metabolites +load(hmdb_file) + +# get current date +rundate <- Sys.Date() + +# create a directory for plots in project directory +dir.create(paste0(outdir, "/plots"), showWarnings = FALSE) +plot_dir <- paste0(outdir, "/plots/adducts") +dir.create(plot_dir, showWarnings = FALSE) + +# set the number of digits for floats +options(digits = 16) + +# load positive and negative adduct sums +load("AdductSums_negative.RData") +outlist_neg_adducts_hmdb <- outlist.tot +load("AdductSums_positive.RData") +outlist_pos_adducts_hmdb <- outlist.tot +rm(outlist.tot) + +# Only continue with patients (columns) that are in both pos and neg, so patients that are in both +tmp <- intersect(colnames(outlist_neg_adducts_hmdb), colnames(outlist_pos_adducts_hmdb)) +outlist_neg_adducts_hmdb <- outlist_neg_adducts_hmdb[, tmp] +outlist_pos_adducts_hmdb <- outlist_pos_adducts_hmdb[, tmp] + +# Find indexes of neg hmdb code that are also found in pos and vice versa +index_neg <- which(rownames(outlist_neg_adducts_hmdb) %in% rownames(outlist_pos_adducts_hmdb)) +index_pos <- which(rownames(outlist_pos_adducts_hmdb) %in% rownames(outlist_neg_adducts_hmdb)) + +# Only continue with HMDB codes (rows) that were found in both positive mode and remove last column (hmdb_name) +tmp_pos <- outlist_pos_adducts_hmdb[rownames(outlist_pos_adducts_hmdb)[index_pos], 1:(dim(outlist_pos_adducts_hmdb)[2] - 1)] +tmp_hmdb_name_pos <- outlist_pos_adducts_hmdb[rownames(outlist_pos_adducts_hmdb)[index_pos], dim(outlist_pos_adducts_hmdb)[2]] +tmp_pos_left <- outlist_pos_adducts_hmdb[-index_pos, ] +# same for negative mode +tmp_neg <- outlist_neg_adducts_hmdb[rownames(outlist_pos_adducts_hmdb)[index_pos], 1:(dim(outlist_neg_adducts_hmdb)[2] - 1)] +tmp_neg_left <- outlist_neg_adducts_hmdb[-index_neg, ] + +# Combine positive and negative numbers and paste back HMDB column +tmp <- apply(tmp_pos, 2, as.numeric) + apply(tmp_neg, 2, as.numeric) +rownames(tmp) <- rownames(tmp_pos) +tmp <- cbind(tmp, "HMDB_name" = tmp_hmdb_name_pos) +outlist <- rbind(tmp, tmp_pos_left, tmp_neg_left) + +# Filter for biological relevance +peaks_in_list <- which(rownames(outlist) %in% rownames(rlvnc)) +outlist <- cbind(outlist[peaks_in_list, ], as.data.frame(rlvnc[rownames(outlist)[peaks_in_list], ])) +# filter out all irrelevant HMDBs +outlist <- outlist %>% + tibble::rownames_to_column("rowname") %>% + filter(!grepl("Exogenous|Drug|exogenous", relevance)) %>% + tibble::column_to_rownames("rowname") + +# Add HMDB_code column with all the HMDB ID and sort on it +outlist <- cbind(outlist, "HMDB_code" = rownames(outlist)) +outlist <- outlist[order(outlist[, "HMDB_code"]), ] + +# Create excel +filelist <- "AllPeakGroups" +wb <- createWorkbook("SinglePatient") +addWorksheet(wb, filelist) + +# Add Z-scores and create plots +if (z_score == 1) { + # add a column for plots + outlist <- cbind(plots = NA, outlist) + # two columns will be added for mean and stdev of controls; Z-scores start at ncol + 3 + startcol <- ncol(outlist) + 3 + + # Get columns with control intensities + control_col_ids <- grep(control_label, colnames(outlist), fixed = TRUE) + control_columns <- as.data.frame(outlist[, control_col_ids]) + colnames(control_columns) <- colnames(outlist)[control_col_ids] + + # Get columns with patient intensities + patient_col_ids <- grep(case_label, colnames(outlist), fixed = TRUE) + patient_columns <- as.data.frame(outlist[, patient_col_ids]) + colnames(patient_columns) <- colnames(outlist)[patient_col_ids] + + intensity_col_ids <- c(control_col_ids, patient_col_ids) + + # if there are any intensities of 0 left, set them to NA for stats + outlist[, intensity_col_ids][outlist[, intensity_col_ids] == 0] <- NA + + # save outlist as it is and use it to calculate robust scaler + outlist_noZ <- outlist + + # calculate mean and sd for Control group + outlist$avg.ctrls <- apply(control_columns, 1, function(x) mean(as.numeric(x), na.rm = TRUE)) + outlist$sd.ctrls <- apply(control_columns, 1, function(x) sd(as.numeric(x), na.rm = TRUE)) + + # Make and add columns with zscores + colnames_z <- NULL + for (i in intensity_col_ids) { + cname <- colnames(outlist)[i] + colnames_z <- c(colnames_z, paste(cname, "Zscore", sep = "_")) + zscores_1col <- (as.numeric(as.vector(unlist(outlist[, i]))) - outlist$avg.ctrls) / outlist$sd.ctrls + outlist <- cbind(outlist, zscores_1col) + } + colnames(outlist)[startcol:ncol(outlist)] <- colnames_z + + # calculate robust scaler (Zscores minus outliers in Controls) + outlist_noZ$avg.ctrls <- 0 + outlist_noZ$sd.ctrls <- 0 + + # only calculate robust Z-scores if there are enough Controls + if (length(control_col_ids) > 10) { + for (metabolite_index in 1:nrow(outlist)) { + outlist_noZ$avg.ctrls[metabolite_index] <- mean(robust_scaler(outlist_noZ[metabolite_index, control_col_ids], + control_col_ids, perc)) + outlist_noZ$sd.ctrls[metabolite_index] <- sd(robust_scaler(outlist_noZ[metabolite_index, control_col_ids], + control_col_ids, perc)) + } + } + + # Make and add columns with robust zscores + cnames_robust <- gsub("_Zscore", "_RobustZscore", colnames_z) + for (i in intensity_col_ids) { + zscores_1col <- (as.numeric(as.vector(unlist(outlist_noZ[, i]))) - outlist_noZ$avg.ctrls) / outlist_noZ$sd.ctrls + outlist_noZ <- cbind(outlist_noZ, zscores_1col) + } + colnames(outlist_noZ)[startcol:ncol(outlist_noZ)] <- cnames_robust + + # output metabolites filtered on relevance + save(outlist, file = paste0("AdductSums_filtered_Zscores.RData")) + write.table(outlist, file = paste0("AdductSums_filtered_Zscores.txt"), sep = "\t", row.names = FALSE) + # output filtered metabolites with robust scaled Zscores + save(outlist_noZ, file = paste0("AdductSums_filtered_robustZ.RData")) + write.table(outlist_noZ, file = paste0("AdductSums_filtered_robustZ.txt"), sep = "\t", row.names = FALSE) + + # get the IDs of the patients and sort + patient_ids <- unique(as.vector(unlist(lapply(strsplit(colnames(patient_columns), ".", fixed = TRUE), function(x) x[1])))) + patient_ids <- patient_ids[order(nchar(patient_ids), patient_ids)] + + # for every row, make boxplot, insert into excel, and calculate Zscore for every patient + temp_png <- NULL + for (p in 1:nrow(outlist)) { + # get HMDB ID + hmdb_name <- rownames(outlist[p, ]) + # get intensities per metabolite for box plot for control samples + intensities <- list(as.numeric(as.vector(unlist(control_columns[p, ])))) + labels <- c("C", patient_ids) + # get intensities per metabolite for box plot for patient samples + for (i in 1:length(patient_ids)) { + id <- patient_ids[i] + # combine all intensities that start with the same string for patients + # exception: if there is only one patient id, skip this step; nothing to combine + if (ncol(patient_columns) > 1) { + patient_int <- as.numeric(as.vector(unlist(outlist[p, names(patient_columns[1, ]) + [startsWith(names(patient_columns[1, ]), paste0(id, "."))]]))) + } else { + patient_int <- as.numeric(unlist(as.vector(patient_columns))) + } + intensities[[i + 1]] <- patient_int + } + intensities <- setNames(intensities, labels) + + plot_width <- length(labels) * 16 + 90 + + plot.new() + if (export) { + png(filename = paste0(plot_dir, "/", hmdb_name, "_box.png"), + width = plot_width, + height = 280) + } + # set margins + par(oma = c(2, 0, 0, 0)) + boxplot(intensities, + col = c("green", rep("red", length(intensities) - 1)), + names.arg = labels, + las = 2, + main = hmdb_name) + dev.off() + + file_png <- paste0(plot_dir, "/", hmdb_name, "_box.png") + if (is.null(temp_png)) { + temp_png <- loder::readPng(file_png) + img_dim <- dim(temp_png)[c(1, 2)] + cell_dim <- img_dim * imagesize_multiplier + setColWidths(wb, filelist, cols = 1, widths = cell_dim[2] / 20) + } + + openxlsx::insertImage(wb, + filelist, + file_png, + startRow = p + 1, + startCol = 1, + height = cell_dim[1], + width = cell_dim[2], + units = "px") + + if (p %% 100 == 0) { + cat("at row: ", p, "\n") + } + } + + openxlsx::setRowHeights(wb, filelist, rows = c(1:nrow(outlist) + 1), heights = cell_dim[1] / 4) + openxlsx::setColWidths(wb, filelist, cols = c(2:ncol(outlist)), widths = 20) +} else { + openxlsx::setRowHeights(wb, filelist, rows = c(1:nrow(outlist)), heights = 18) + openxlsx::setColWidths(wb, filelist, cols = c(1:ncol(outlist)), widths = 20) +} + +# write Excel file +openxlsx::writeData(wb, sheet = 1, outlist, startCol = 1) +xlsx_name <- paste0(outdir, "/", project, ".xlsx") +openxlsx::saveWorkbook(wb, xlsx_name, overwrite = TRUE) +rm(wb) + +#### INTERNAL STANDARDS #### +is_list <- outlist[grep("Internal standard", outlist[, "relevance"], fixed = TRUE), ] +is_codes <- rownames(is_list) + +# if all data from one samplename (for example P195.1) is filtered out in 3-averageTechReplicates +# because of too little data (threshold parameter)i, the init.RData (repl_pattern) will contain more sample_names +# than the peak data (IS), so this data needs to be removed first, before the retrieval of the summed adducts. +# Write sample_names to a log file, to let user know that this sample_name contained no data. +sample_names_nodata <- setdiff(names(repl_pattern), names(is_list)) +if (!is.null(sample_names_nodata)) { + write.table(sample_names_nodata, file = paste(outdir, "sample_names_nodata.txt", sep = "/"), + row.names = FALSE, col.names = FALSE, quote = FALSE) + cat(sample_names_nodata, "\n") + for (sample_name in sample_names_nodata) { + repl_pattern[[sample_name]] <- NULL + } +} + +# Retrieve IS summed adducts +is_summed <- is_list[c(names(repl_pattern), "HMDB_code")] +is_summed$HMDB.name <- is_list$name +is_summed <- reshape2::melt(is_summed, id.vars = c("HMDB_code", "HMDB.name")) +colnames(is_summed) <- c("HMDB.code", "HMDB.name", "Sample", "Intensity") +is_summed$Matrix <- dims_matrix +is_summed$Rundate <- rundate +is_summed$Project <- project +is_summed$Intensity <- as.numeric(as.character(is_summed$Intensity)) + +# Retrieve IS positive mode +is_pos <- as.data.frame(subset(outlist_pos_adducts_hmdb, rownames(outlist_pos_adducts_hmdb) %in% is_codes)) +is_pos$HMDB_name <- is_list[match(row.names(is_pos), is_list$HMDB_code, nomatch = NA), "name"] +is_pos$HMDB.code <- row.names(is_pos) +is_pos <- reshape2::melt(is_pos, id.vars = c("HMDB.code", "HMDB_name")) +colnames(is_pos) <- c("HMDB.code", "HMDB.name", "Sample", "Intensity") +is_pos$Matrix <- dims_matrix +is_pos$Rundate <- rundate +is_pos$Project <- project +is_pos$Intensity <- as.numeric(as.character(is_pos$Intensity)) + +# Retrieve IS negative mode +is_neg <- as.data.frame(subset(outlist_neg_adducts_hmdb, rownames(outlist_neg_adducts_hmdb) %in% is_codes)) +is_neg$HMDB_name <- is_list[match(row.names(is_neg), is_list$HMDB_code, nomatch = NA), "name"] +is_neg$HMDB.code <- row.names(is_neg) +is_neg <- reshape2::melt(is_neg, id.vars = c("HMDB.code", "HMDB_name")) +colnames(is_neg) <- c("HMDB.code", "HMDB.name", "Sample", "Intensity") +is_neg$Matrix <- dims_matrix +is_neg$Rundate <- rundate +is_neg$Project <- project +is_neg$Intensity <- as.numeric(as.character(is_neg$Intensity)) + +# Save results +save(is_pos, is_neg, is_summed, file = paste0(outdir, "/", project, "_IS_results.RData")) + +# number of samples, for plotting length and width +sample_count <- length(repl_pattern) + +# change the order of the x-axis summed plots to a natural sorted one +sample_naturalorder <- unique(as.character(is_summed$Sample)) +sample_naturalorder <- stringr::str_sort(sample_naturalorder, numeric = TRUE) +is_summed$Sample_level <- factor(is_summed$Sample, levels = c(sample_naturalorder)) +is_pos$Sample_level <- factor(is_pos$Sample, levels = c(sample_naturalorder)) +is_neg$Sample_level <- factor(is_neg$Sample, levels = c(sample_naturalorder)) + +## bar plots with all IS +# theme for all IS bar plots +theme_is_bar <- function(my_plot) { + my_plot + + ggplot2::scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) + + ggplot2::theme(legend.position = "none", + axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 6), + axis.text.y = element_text(size = 6) + ) +} + +# ggplot functions +is_neg_bar_plot <- ggplot(is_neg, aes(Sample_level, Intensity)) + + ggtitle("Interne Standaard (Neg)") + + geom_bar(aes(fill = HMDB.name), stat = "identity") + + labs(x = "", y = "Intensity") + + facet_wrap(~HMDB.name, scales = "free_y") + +is_pos_bar_plot <- ggplot(is_pos, aes(Sample_level, Intensity)) + + ggtitle("Interne Standaard (Pos)") + + geom_bar(aes(fill = HMDB.name), stat = "identity") + + labs(x = "", y = "Intensity") + + facet_wrap(~HMDB.name, scales = "free_y") + +is_sum_bar_plot <- ggplot(is_summed, aes(Sample_level, Intensity)) + + ggtitle("Interne Standaard (Summed)") + + geom_bar(aes(fill = HMDB.name), stat = "identity") + + labs(x = "", y = "Intensity") + + facet_wrap(~HMDB.name, scales = "free_y") + +# add theme to ggplot functions +is_neg_bar_plot <- theme_is_bar(is_neg_bar_plot) +is_pos_bar_plot <- theme_is_bar(is_pos_bar_plot) +is_sum_bar_plot <- theme_is_bar(is_sum_bar_plot) + +# save plots to disk +plot_width <- 9 + 0.35 * sample_count +ggsave(paste0(outdir, "/plots/IS_bar_all_neg.png"), + plot = is_neg_bar_plot, height = plot_width / 2.5, width = plot_width, units = "in") +ggsave(paste0(outdir, "/plots/IS_bar_all_pos.png"), + plot = is_pos_bar_plot, height = plot_width / 2.5, width = plot_width, units = "in") +ggsave(paste0(outdir, "/plots/IS_bar_all_sum.png"), + plot = is_sum_bar_plot, height = plot_width / 2.5, width = plot_width, units = "in") + +## Line plots with all IS +# function for ggplot theme +# add smaller legend in the "all IS line plots", otherwise out-of-range when more than 13 IS lines +theme_is_line <- function(my_plot) { + my_plot + + guides(shape = guide_legend(override.aes = list(size = 0.5)), + color = guide_legend(override.aes = list(size = 0.5)) + ) + + theme(legend.title = element_text(size = 8), + legend.text = element_text(size = 6), + legend.key.size = unit(0.7, "line"), + axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 8) + ) +} + +# ggplot functions +is_neg_line_plot <- ggplot(is_neg, aes(Sample_level, Intensity)) + + ggtitle("Interne Standaard (Neg)") + + geom_point(aes(col = HMDB.name)) + + geom_line(aes(col = HMDB.name, group = HMDB.name)) + + labs(x = "", y = "Intensity") + +is_pos_line_plot <- ggplot(is_pos, aes(Sample_level, Intensity)) + + ggtitle("Interne Standaard (Pos)") + + geom_point(aes(col = HMDB.name)) + + geom_line(aes(col = HMDB.name, group = HMDB.name)) + + labs(x = "", y = "Intensity") + +is_sum_line_plot <- ggplot(is_summed, aes(Sample_level, Intensity)) + + ggtitle("Interne Standaard (Sum)") + + geom_point(aes(col = HMDB.name)) + + geom_line(aes(col = HMDB.name, group = HMDB.name)) + + labs(x = "", y = "Intensity") + +# add theme to ggplot functions +is_sum_line_plot <- theme_is_line(is_sum_line_plot) +is_neg_line_plot <- theme_is_line(is_neg_line_plot) +is_pos_line_plot <- theme_is_line(is_pos_line_plot) + +# save plots to disk +plot_width <- 8 + 0.2 * sample_count +ggsave(paste0(outdir, "/plots/IS_line_all_neg.png"), + plot = is_neg_line_plot, height = plot_width / 2.5, width = plot_width, units = "in") +ggsave(paste0(outdir, "/plots/IS_line_all_pos.png"), + plot = is_pos_line_plot, height = plot_width / 2.5, width = plot_width, units = "in") +ggsave(paste0(outdir, "/plots/IS_line_all_sum.png"), + plot = is_sum_line_plot, height = plot_width / 2.5, width = plot_width, units = "in") + +## bar plots with a selection of IS +is_neg_selection <- c("2H2-Ornithine (IS)", "2H3-Glutamate (IS)", "2H2-Citrulline (IS)", "2H4_13C5-Arginine (IS)", + "13C6-Tyrosine (IS)") +is_pos_selection <- c("2H4-Alanine (IS)", "13C6-Phenylalanine (IS)", "2H4_13C5-Arginine (IS)", "2H3-Propionylcarnitine (IS)", + "2H9-Isovalerylcarnitine (IS)") +is_sum_selection <- c("2H8-Valine (IS)", "2H3-Leucine (IS)", "2H3-Glutamate (IS)", "2H4_13C5-Arginine (IS)", + "13C6-Tyrosine (IS)") + +# add minimal intensity lines based on matrix (DBS or Plasma) and machine mode (neg, pos, sum) +if (dims_matrix == "DBS") { + hline_data_neg <- + data.frame( + z = c(15000, 200000, 130000, 18000, 50000), + HMDB.name = is_neg_selection + ) + hline_data_pos <- + data.frame( + z = c(150000, 3300000, 1750000, 150000, 270000), + HMDB.name = is_pos_selection + ) + hline_data_sum <- + data.frame( + z = c(1300000, 2500000, 500000, 1800000, 1400000), + HMDB.name = is_sum_selection + ) +} else if (dims_matrix == "Plasma") { + hline_data_neg <- + data.frame( + z = c(6500, 100000, 75000, 7500, 25000), + HMDB.name = is_neg_selection + ) + hline_data_pos <- + data.frame( + z = c(85000, 1000000, 425000, 70000, 180000), + HMDB.name = is_pos_selection + ) + hline_data_sum <- + data.frame( + z = c(700000, 1250000, 150000, 425000, 300000), + HMDB.name = is_sum_selection + ) +} + +# ggplot functions +is_neg_selection_barplot <- ggplot(subset(is_neg, HMDB.name %in% is_neg_selection), aes(Sample_level, Intensity)) + + ggtitle("Interne Standaard (Neg)") + + geom_bar(aes(fill = HMDB.name), stat = "identity") + + labs(x = "", y = "Intensity") + + facet_wrap(~HMDB.name, scales = "free", ncol = 2) + + if (exists("hline_data_neg")) { + geom_hline(aes(yintercept = z), subset(hline_data_neg, HMDB.name %in% is_neg$HMDB.name)) + } + +is_pos_selection_barplot <- ggplot(subset(is_pos, HMDB.name %in% is_pos_selection), aes(Sample_level, Intensity)) + + ggtitle("Interne Standaard (Pos)") + + geom_bar(aes(fill = HMDB.name), stat = "identity") + + labs(x = "", y = "Intensity") + + facet_wrap(~HMDB.name, scales = "free", ncol = 2) + + if (exists("hline_data_pos")) { + geom_hline(aes(yintercept = z), subset(hline_data_pos, HMDB.name %in% is_pos$HMDB.name)) + } + +is_sum_selection_barplot <- ggplot(subset(is_summed, HMDB.name %in% is_sum_selection), aes(Sample_level, Intensity)) + + ggtitle("Interne Standaard (Sum)") + + geom_bar(aes(fill = HMDB.name), stat = "identity") + + labs(x = "", y = "Intensity") + + facet_wrap(~HMDB.name, scales = "free", ncol = 2) + + if (exists("hline_data_sum")) { + geom_hline(aes(yintercept = z), subset(hline_data_sum, HMDB.name %in% is_summed$HMDB.name)) + } + +# add theme to ggplot functions +is_neg_selection_barplot <- theme_is_bar(is_neg_selection_barplot) +is_pos_selection_barplot <- theme_is_bar(is_pos_selection_barplot) +is_sum_selection_barplot <- theme_is_bar(is_sum_selection_barplot) + +# save plots to disk +plot_width <- 9 + 0.35 * sample_count +ggsave(paste0(outdir, "/plots/IS_bar_select_neg.png"), + plot = is_neg_selection_barplot, height = plot_width / 2.0, width = plot_width, units = "in") +ggsave(paste0(outdir, "/plots/IS_bar_select_pos.png"), + plot = is_pos_selection_barplot, height = plot_width / 2.0, width = plot_width, units = "in") +ggsave(paste0(outdir, "/plots/IS_bar_select_sum.png"), + plot = is_sum_selection_barplot, height = plot_width / 2.0, width = plot_width, units = "in") + +## line plots with a selection of IS +# ggplot functions +is_neg_selection_lineplot <- ggplot(subset(is_neg, HMDB.name %in% is_neg_selection), aes(Sample_level, Intensity)) + + ggtitle("Interne Standaard (Neg)") + + geom_point(aes(col = HMDB.name)) + + geom_line(aes(col = HMDB.name, group = HMDB.name)) + + labs(x = "", y = "Intensity") + +is_pos_selection_lineplot <- ggplot(subset(is_pos, HMDB.name %in% is_pos_selection), aes(Sample_level, Intensity)) + + ggtitle("Interne Standaard (Pos)") + + geom_point(aes(col = HMDB.name)) + + geom_line(aes(col = HMDB.name, group = HMDB.name)) + + labs(x = "", y = "Intensity") + +is_sum_selection_lineplot <- ggplot(subset(is_summed, HMDB.name %in% is_sum_selection), aes(Sample_level, Intensity)) + + ggtitle("Interne Standaard (Sum)") + + geom_point(aes(col = HMDB.name)) + + geom_line(aes(col = HMDB.name, group = HMDB.name)) + + labs(x = "", y = "Intensity") + +# add theme to ggplot functions +is_neg_selection_lineplot <- theme_is_line(is_neg_selection_lineplot) +is_pos_selection_lineplot <- theme_is_line(is_pos_selection_lineplot) +is_sum_selection_lineplot <- theme_is_line(is_sum_selection_lineplot) + +# save plots to disk +plot_width <- 8 + 0.2 * sample_count +ggsave(paste0(outdir, "/plots/IS_line_select_neg.png"), + plot = is_neg_selection_lineplot, height = plot_width / 2.5, width = plot_width, units = "in") +ggsave(paste0(outdir, "/plots/IS_line_select_pos.png"), + plot = is_pos_selection_lineplot, height = plot_width / 2.5, width = plot_width, units = "in") +ggsave(paste0(outdir, "/plots/IS_line_select_sum.png"), + plot = is_sum_selection_lineplot, height = plot_width / 2.5, width = plot_width, units = "in") + + +### POSITIVE CONTROLS CHECK +# these positive controls need to be in the samplesheet, in order to make the positive_control.RData file +# Positive control samples all have the format P1002.x, P1003.x and P1005.x (where x is a number) + +column_list <- colnames(outlist) +patterns <- c("^(P1002\\.)[[:digit:]]+_", "^(P1003\\.)[[:digit:]]+_", "^(P1005\\.)[[:digit:]]+_") +positive_controls_index <- grepl(pattern = paste(patterns, collapse = "|"), column_list) +positive_control_list <- column_list[positive_controls_index] + +if (z_score == 1) { + # find if one or more positive control samples are missing + pos_contr_warning <- c() + if (any(grep("^(P1002\\.)[[:digit:]]+_", positive_control_list)) && + any(grep("^(P1003\\.)[[:digit:]]+_", positive_control_list)) && + any(grep("^(P1005\\.)[[:digit:]]+_", positive_control_list))) { + cat("All three positive controls are present") + } else { + pos_contr_warning <- paste0(c("positive controls list is not complete. Only ", + positive_control_list, " is/are present"), collapse = " ") + } + # you need all positive control samples, thus starting the script only if all are available + if (length(pos_contr_warning) == 0) { + # make positive control excel with specific HMDB_codes in combination with specific control samples + pa_sample_name <- positive_control_list[grepl("P1002", positive_control_list)] + pku_sample_name <- positive_control_list[grepl("P1003", positive_control_list)] + lpi_sample_name <- positive_control_list[grepl("P1005", positive_control_list)] + + pa_codes <- c("HMDB00824", "HMDB00783", "HMDB00123") + pku_codes <- c("HMDB00159") + lpi_codes <- c("HMDB00904", "HMDB00641", "HMDB00182") + + pa_data <- outlist[pa_codes, c("HMDB_code", "name", pa_sample_name)] + pa_data <- reshape2::melt(pa_data, id.vars = c("HMDB_code", "name")) + colnames(pa_data) <- c("HMDB.code", "HMDB.name", "Sample", "Zscore") + + pku_data <- outlist[pku_codes, c("HMDB_code", "name", pku_sample_name)] + pku_data <- reshape2::melt(pku_data, id.vars = c("HMDB_code", "name")) + colnames(pku_data) <- c("HMDB.code", "HMDB.name", "Sample", "Zscore") + + lpi_data <- outlist[lpi_codes, c("HMDB_code", "name", lpi_sample_name)] + lpi_data <- reshape2::melt(lpi_data, id.vars = c("HMDB_code", "name")) + colnames(lpi_data) <- c("HMDB.code", "HMDB.name", "Sample", "Zscore") + + positive_control <- rbind(pa_data, pku_data, lpi_data) + positive_control$Zscore <- as.numeric(positive_control$Zscore) + # extra information added to excel for future reference. made in beginning of this script + positive_control$Matrix <- dims_matrix + positive_control$Rundate <- rundate + positive_control$Project <- project + + # Save results + save(positive_control, file = paste0(outdir, "/", project, "_positive_control.RData")) + # round the Z-scores to 2 digits + positive_control$Zscore <- round_df(positive_control$Zscore, 2) + write.xlsx(positive_control, file = paste0(outdir, "/", project, "_positive_control.xlsx"), + sheetName = "Sheet1", col.names = TRUE, row.names = TRUE, append = FALSE) + } else { + write.table(pos_contr_warning, file = paste(outdir, "positive_controls_warning.txt", sep = "/"), + row.names = FALSE, col.names = FALSE, quote = FALSE) + } +} + +### MISSING M/Z CHECK +# check the outlist_identified_(negative/positive).RData files for missing m/z values and mention in the results mail +# Load the outlist_identified files + remove the loaded files +load(paste0(outdir, "/outlist_identified_negative.RData")) +outlist_ident_neg <- outlist_ident +load(paste0(outdir, "/outlist_identified_positive.RData")) +outlist_ident_pos <- outlist_ident +rm(outlist_ident) +# check for missing m/z in negative and positive mode +scanmode <- c("Negative", "Positive") +index <- 1 +results_ident <- c() +outlist_ident_list <- list(outlist_ident_neg, outlist_ident_pos) +for (outlist_ident in outlist_ident_list) { + current_mode <- scanmode[index] + # retrieve all unique m/z values in whole numbers and check if all are available + mz_values <- as.numeric(unique(format(outlist_ident$mzmed.pgrp, digits = 0))) + # m/z range for a standard run = 70-600 + mz_range <- seq(70, 599, by = 1) + mz_missing <- c() + for (mz in mz_range) { + if (!mz %in% mz_values) { + mz_missing <- c(mz_missing, mz) + } + } + y <- mz_missing + # check if m/z are missing and make an .txt file with information + group_ident <- cumsum(c(1, abs(y[-length(y)] - y[-1]) > 1)) + if (length(group_ident) > 1) { + results_ident <- c(results_ident, paste0("Missing m/z values ", current_mode, " mode")) + results_ident <- c(results_ident, by(y, group_ident, identity)) + } else { + results_ident <- c(results_ident, paste0(current_mode, " mode did not have missing mz values")) + } + # change to other scanmode + index <- index + 1 +} +lapply(results_ident, write, file = paste0(outdir, "/missing_mz_warning.txt"), append = TRUE, ncolumns = 1000) diff --git a/DIMS/GenerateExcel.nf b/DIMS/GenerateExcel.nf new file mode 100644 index 0000000..6134766 --- /dev/null +++ b/DIMS/GenerateExcel.nf @@ -0,0 +1,24 @@ +process GenerateExcel { + tag "DIMS GenerateExcel" + label 'GenerateExcel' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(collect_files) + path(identified_files) + path(init_file) + val(analysis_id) + path(relevance_file) + + output: + path('*.RData') + path('*.txt') + path('*.xlsx'), emit: excel_files + path('plots/*.png'), emit: plot_files + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/GenerateExcel.R $init_file $analysis_id $params.matrix $relevance_file $params.zscore + """ +} diff --git a/DIMS/GenerateViolinPlots.R b/DIMS/GenerateViolinPlots.R new file mode 100644 index 0000000..29f41cf --- /dev/null +++ b/DIMS/GenerateViolinPlots.R @@ -0,0 +1,487 @@ +# For untargeted metabolomics, this tool calculates probability scores for +# metabolic disorders. In addition, it provides visual support with violin plots +# of the DIMS measurements for the lab specialists. +# Input needed: +# 1. Excel file in which metabolites are listed with their intensities for +# controls (with C in samplename) and patients (with P in samplename) and their +# corresponding Z-scores. +# 2. All files from github: https://github.com/UMCUGenetics/DIMS + +## adapted from 15-dIEM_violin.R + +# load packages +suppressPackageStartupMessages(library("dplyr")) +library(reshape2) +library(openxlsx) +library(ggplot2) +suppressPackageStartupMessages(library("gridExtra")) +library(stringr) + +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) + +run_name <- cmd_args[1] +scripts_dir <- cmd_args[2] +z_score <- as.numeric(cmd_args[3]) +path_metabolite_groups <- cmd_args[4] +file_ratios_metabolites <- cmd_args[5] +file_expected_biomarkers_iem <- cmd_args[6] +file_explanation <- cmd_args[7] +file_isomers <- cmd_args[8] + +if (z_score == 1){ + # path: output folder for dIEM and violin plots + output_dir <- "./" + + file.copy(file_isomers, output_dir) + + # load functions + source(paste0(scripts_dir, "check_same_samplename.R")) + source(paste0(scripts_dir, "prepare_data.R")) + source(paste0(scripts_dir, "prepare_data_perpage.R")) + source(paste0(scripts_dir, "prepare_toplist.R")) + source(paste0(scripts_dir, "create_violin_plots.R")) + source(paste0(scripts_dir, "prepare_alarmvalues.R")) + source(paste0(scripts_dir, "output_helix.R")) + source(paste0(scripts_dir, "get_patient_data_to_helix.R")) + source(paste0(scripts_dir, "add_lab_id_and_onderzoeksnummer.R")) + source(paste0(scripts_dir, "is_diagnostic_patient.R")) + + # number of diseases that score highest in algorithm to plot + top_nr_iem <- 5 + # probability score cut-off for plotting the top diseases + threshold_iem <- 5 + # z-score cutoff of axis on the left for top diseases + ratios_cutoff <- -5 + # number of violin plots per page in PDF + nr_plots_perpage <- 20 + + # binary variable: run function, yes(1) or no(0) + if (z_score == 1) { + algorithm <- ratios <- violin <- 1 + } else { + algorithm <- ratios <- violin <- 0 + } + # are the sample names headers on row 1 or row 2 in the DIMS excel? (default 1) + header_row <- 1 + # column name where the data starts (default B) + col_start <- "B" + zscore_cutoff <- 5 + xaxis_cutoff <- 20 + protocol_name <- "DIMS_PL_DIAG" + + #### STEP 1: Preparation #### + # in: run_name, path_dims_file, header_row ||| out: output_dir, DIMS + + # path to DIMS excel file + path_dims_file <- paste0(run_name, ".xlsx") + + # Load the excel file. + dims_xls <- readWorkbook(xlsxFile = path_dims_file, sheet = 1, startRow = header_row) + if (exists("dims_xls")) { + cat(paste0("\nThe excel file is succesfully loaded:\n -> ", path_dims_file)) + } else { + cat(paste0("\n\n**** Error: Could not find an Excel file. Please check location of file:\n -> ", path_dims_file, "\n")) + } + + #### STEP 2: Edit DIMS data ##### + # in: dims_xls ||| out: Data, nr_contr, nr_pat + # Input: the xlsx file that comes out of the pipeline with format: + # [plots] [C] [P] [summary columns] [C_Zscore] [P_Zscore] + # Output: "_CSV.csv" file that is suited for the algorithm in shiny. + + # Determine the number of Contols and Patients in column names: + nr_contr <- length(grep("C", names(dims_xls))) / 2 + nr_pat <- length(grep("P", names(dims_xls))) / 2 + # total number of samples + nrsamples <- nr_contr + nr_pat + # check whether the number of intensity columns equals the number of Zscore columns + if (nr_contr + nr_pat != length(grep("_Zscore", names(dims_xls)))) { + cat("\n**** Error: there aren't as many intensities listed as Zscores") + } + cat(paste0("\n\n------------\n", nr_contr, " controls \n", nr_pat, " patients\n------------\n\n")) + + # Move the columns HMDB_code and HMDB_name to the beginning. + hmdb_info_cols <- c(which(colnames(dims_xls) == "HMDB_code"), which(colnames(dims_xls) == "HMDB_name")) + other_cols <- seq_along(1:ncol(dims_xls))[-hmdb_info_cols] + dims_xls_copy <- dims_xls[, c(hmdb_info_cols, other_cols)] + # Remove the columns from 'name' to 'pathway' + from_col <- which(colnames(dims_xls_copy) == "name") + to_col <- which(colnames(dims_xls_copy) == "pathway") + dims_xls_copy <- dims_xls_copy[, -c(from_col:to_col)] + # in case the excel had an empty "plots" column, remove it + if ("plots" %in% colnames(dims_xls_copy)) { + dims_xls_copy <- dims_xls_copy[, -grep("plots", colnames(dims_xls_copy))] + } + # Rename columns + names(dims_xls_copy) <- gsub("avg.ctrls", "Mean_controls", names(dims_xls_copy)) + names(dims_xls_copy) <- gsub("sd.ctrls", "SD_controls", names(dims_xls_copy)) + names(dims_xls_copy) <- gsub("HMDB_code", "HMDB.code", names(dims_xls_copy)) + names(dims_xls_copy) <- gsub("HMDB_name", "HMDB.name", names(dims_xls_copy)) + + # intensity columns and mean and standard deviation of controls + numeric_cols <- c(3:ncol(dims_xls_copy)) + # make sure all values are numeric + dims_xls_copy[, numeric_cols] <- sapply(dims_xls_copy[, numeric_cols], as.numeric) + + if (exists("dims_xls_copy") & (length(dims_xls_copy) < length(dims_xls))) { + cat("\n### Step 2 # Edit dims data is done.\n") + } else { + cat("\n**** Error: Could not execute step 2 \n") + } + + #### STEP 3: Calculate ratios of intensities for metabolites #### + # in: ratios, file_ratios_metabolites, dims_xls_copy, nr_contr, nr_pat ||| out: Zscore (+file) + # This script loads the file with Ratios (file_ratios_metabolites) and calculates + # the ratios of the intensities of the given metabolites. It also calculates + # Zs-cores based on the avg and sd of the ratios of the controls. + + # Input: dataframe with intenstities and Zscores of controls and patients: + # [HMDB.code] [HMDB.name] [C] [P] [Mean_controls] [SD_controls] [C_Zscore] [P_Zscore] + + # Output: "_CSV.csv" file that is suited for the algorithm, with format: + # "_Ratios_CSV.csv" file, same file as above, but with ratio rows added. + + if (ratios == 1) { + cat(paste0("\nloading ratios file:\n -> ", file_ratios_metabolites, "\n")) + ratio_input <- read.csv(file_ratios_metabolites, sep = ";", stringsAsFactors = FALSE) + + # Prepare empty data frame to fill with ratios + ratio_list <- setNames(data.frame(matrix( + ncol = ncol(dims_xls_copy), + nrow = nrow(ratio_input) + )), colnames(dims_xls_copy)) + ratio_list <- as.data.frame(ratio_list) + + # put HMDB info into first two columns of ratio_list + ratio_list[, 1:2] <- ratio_input[, 1:2] + + # look for intensity columns (exclude Zscore columns) + control_cols <- grep("C", colnames(ratio_list)[1:which(colnames(ratio_list) == "Mean_controls")]) + patient_cols <- grep("P", colnames(ratio_list)[1:which(colnames(ratio_list) == "Mean_controls")]) + intensity_cols <- c(control_cols, patient_cols) + # calculate each of the ratios of intensities + for (ratio_index in 1:nrow(ratio_input)) { + ratio_numerator <- ratio_input[ratio_index, "HMDB_numerator"] + ratio_numerator <- strsplit(ratio_numerator, "plus")[[1]] + ratio_denominator <- ratio_input[ratio_index, "HMDB_denominator"] + ratio_denominator <- strsplit(ratio_denominator, "plus")[[1]] + # find these HMDB IDs in dataset. Could be a sum of multiple metabolites + sel_denominator <- sel_numerator <- c() + for (numerator_index in 1:length(ratio_numerator)) { + sel_numerator <- c(sel_numerator, which(dims_xls_copy[, "HMDB.code"] == ratio_numerator[numerator_index])) + } + for (denominator_index in 1:length(ratio_denominator)) { + # special case for sum of metabolites (dividing by one) + if (ratio_denominator[denominator_index] != "one") { + sel_denominator <- c(sel_denominator, which(dims_xls_copy[, "HMDB.code"] == ratio_denominator[denominator_index])) + } + } + # calculate ratio + if (ratio_denominator[denominator_index] != "one") { + ratio_list[ratio_index, intensity_cols] <- apply(dims_xls_copy[sel_numerator, intensity_cols], 2, sum) / + apply(dims_xls_copy[sel_denominator, intensity_cols], 2, sum) + } else { + # special case for sum of metabolites (dividing by one) + ratio_list[ratio_index, intensity_cols] <- apply(dims_xls_copy[sel_numerator, intensity_cols], 2, sum) + } + # calculate log of ratio + ratio_list[ratio_index, intensity_cols] <- log2(ratio_list[ratio_index, intensity_cols]) + } + + # Calculate means and SD's of the calculated ratios for Controls + ratio_list[, "Mean_controls"] <- apply(ratio_list[, control_cols], 1, mean) + ratio_list[, "SD_controls"] <- apply(ratio_list[, control_cols], 1, sd) + + # Calc z-scores with the means and SD's of Controls + zscore_cols <- grep("Zscore", colnames(ratio_list)) + for (sample_index in 1:length(zscore_cols)) { + zscore_col <- zscore_cols[sample_index] + # matching intensity column + int_col <- intensity_cols[sample_index] + # test on column names + if (check_same_samplename(colnames(ratio_list)[int_col], colnames(ratio_list)[zscore_col])) { + # calculate Z-scores + ratio_list[, zscore_col] <- (ratio_list[, int_col] - ratio_list[, "Mean_controls"]) / ratio_list[, "SD_controls"] + } + } + + # Add rows of the ratio hmdb codes to the data of zscores from the pipeline. + dims_xls_ratios <- rbind(ratio_list, dims_xls_copy) + + # Edit the DIMS output Zscores of all patients in format: + # HMDB_code patientname1 patientname2 + names(dims_xls_ratios) <- gsub("HMDB.code", "HMDB_code", names(dims_xls_ratios)) + names(dims_xls_ratios) <- gsub("HMDB.name", "HMDB_name", names(dims_xls_ratios)) + + # for debugging: + write.table(dims_xls_ratios, file = paste0(output_dir, "/ratios.txt"), sep = "\t") + + # Select only the cols with zscores of the patients + zscore_patients <- dims_xls_ratios[, c(1, 2, zscore_cols[grep("P", colnames(dims_xls_ratios)[zscore_cols])])] + # Select only the cols with zscores of the controls + zscore_controls <- dims_xls_ratios[, c(1, 2, zscore_cols[grep("C", colnames(dims_xls_ratios)[zscore_cols])])] + + } + + #### STEP 4: Run the IEM algorithm ######### + # in: algorithm, file_expected_biomarkers_iem, zscore_patients ||| out: prob_score (+file) + # algorithm taken from DOI: 10.3390/ijms21030979 + + if (algorithm == 1) { + # Load data + cat(paste0("\nloading expected file:\n -> ", file_expected_biomarkers_iem, "\n")) + expected_biomarkers <- read.csv(file_expected_biomarkers_iem, sep = ";", stringsAsFactors = FALSE) + # modify column names + names(expected_biomarkers) <- gsub("HMDB.code", "HMDB_code", names(expected_biomarkers)) + names(expected_biomarkers) <- gsub("Metabolite", "HMDB_name", names(expected_biomarkers)) + + # prepare dataframe scaffold rank_patients + rank_patients <- zscore_patients + # Fill df rank_patients with the ranks for each patient + for (patient_index in 3:ncol(zscore_patients)) { + # number of positive zscores in patient + pos <- sum(zscore_patients[, patient_index] > 0) + # sort the column on zscore; NB: this sorts the entire object, not just one column + rank_patients <- rank_patients[order(-rank_patients[patient_index]), ] + # Rank all positive zscores highest to lowest + rank_patients[1:pos, patient_index] <- as.numeric(ordered(-rank_patients[1:pos, patient_index])) + # Rank all negative zscores lowest to highest + rank_patients[(pos + 1):nrow(rank_patients), patient_index] <- as.numeric(ordered(rank_patients[(pos + 1): + nrow(rank_patients), patient_index])) + } + + # Calculate metabolite score, using the dataframes with only values, and later add the cols without values (1&2). + expected_zscores <- merge(x = expected_biomarkers, y = zscore_patients, by.x = c("HMDB_code"), by.y = c("HMDB_code")) + expected_zscores_original <- expected_zscores + + # determine which columns contain Z-scores and which contain disease info + select_zscore_cols <- grep("_Zscore", colnames(expected_zscores)) + select_info_cols <- 1:(min(select_zscore_cols) - 1) + # set some zscores to zero + select_incr_indisp <- which(expected_zscores$Change == "Increase" & expected_zscores$Dispensability == "Indispensable") + expected_zscores[select_incr_indisp, select_zscore_cols] <- lapply(expected_zscores[select_incr_indisp, + select_zscore_cols], function(x) ifelse (x <= 1.6, 0, x)) + select_decr_indisp <- which(expected_zscores$Change == "Decrease" & expected_zscores$Dispensability == "Indispensable") + expected_zscores[select_decr_indisp, select_zscore_cols] <- lapply(expected_zscores[select_decr_indisp, + select_zscore_cols], function(x) ifelse (x >= -1.2, 0, x)) + + # calculate rank score: + expected_ranks <- merge(x = expected_biomarkers, y = rank_patients, by.x = c("HMDB_code"), by.y = c("HMDB_code")) + rank_scores <- expected_zscores[order(expected_zscores$HMDB_code), select_zscore_cols] / + (expected_ranks[order(expected_ranks$HMDB_code), select_zscore_cols] * 0.9) + # combine disease info with rank scores + expected_metabscore <- cbind(expected_ranks[order(expected_zscores$HMDB_code), select_info_cols], rank_scores) + + # multiply weight score and rank score + weight_score <- expected_zscores + weight_score[, select_zscore_cols] <- expected_metabscore$Total_Weight * expected_metabscore[, select_zscore_cols] + + # sort table on Disease and Absolute_Weight + weight_score <- weight_score[order(weight_score$Disease, weight_score$Absolute_Weight, decreasing = TRUE), ] + + # select columns to check duplicates + dup <- weight_score[, c("Disease", "M.z")] + uni <- weight_score[!duplicated(dup) | !duplicated(dup, fromLast = FALSE), ] + + # calculate probability score + prob_score <- aggregate(uni[, select_zscore_cols], uni["Disease"], sum) + + # list of all diseases that have at least one metabolite Zscore at 0 + for (patient_index in 2:ncol(prob_score)) { + patient_zscore_colname <- colnames(prob_score)[patient_index] + matching_colname_expected <- which(colnames(expected_zscores) == patient_zscore_colname) + # determine which Zscores are 0 for this patient + zscores_zero <- which(expected_zscores[, matching_colname_expected] == 0) + # get Disease for these + disease_zero <- unique(expected_zscores[zscores_zero, "Disease"]) + # set the probability score of these diseases to 0 + prob_score[which(prob_score$Disease %in% disease_zero), patient_index] <- 0 + } + + # determine disease rank per patient + disease_rank <- prob_score + # rank diseases in decreasing order + disease_rank[2:ncol(disease_rank)] <- lapply(2:ncol(disease_rank), function(x) + as.numeric(ordered(-disease_rank[1:nrow(disease_rank), x]))) + # modify column names, Zscores have now been converted to probability scores + colnames(prob_score) <- gsub("_Zscore", "_prob_score", colnames(prob_score)) + colnames(disease_rank) <- gsub("_Zscore", "", colnames(disease_rank)) + + # Create conditional formatting for output Excel sheet. Colors according to values. + wb <- createWorkbook() + addWorksheet(wb, "Probability Scores") + writeData(wb, "Probability Scores", prob_score) + conditionalFormatting(wb, "Probability Scores", cols = 2:ncol(prob_score), rows = 1:nrow(prob_score), + type = "colourScale", style = c("white", "#FFFDA2", "red"), rule = c(1, 10, 100)) + saveWorkbook(wb, file = paste0(output_dir, "/dIEM_algoritme_output_", run_name, ".xlsx"), overwrite = TRUE) + # check whether prob_score df exists and has expected dimensions. + if (exists("expected_biomarkers") & (length(disease_rank) == length(prob_score))) { + cat("\n### Step 4 # Running the IEM algorithm is done.\n\n") + } else { + cat("\n**** Error: Could not run IEM algorithm. Check if path to expected_biomarkers csv-file is correct. \n") + } + + rm(wb) + } + + #### STEP 5: Make violin plots ##### + # in: algorithm / zscore_patients, violin, nr_contr, nr_pat, Data, path_textfiles, zscore_cutoff, xaxis_cutoff, + # top_diseases, top_metab, output_dir ||| out: pdf file, Helix csv file + + if (violin == 1) { + + # preparation + zscore_patients_copy <- zscore_patients + # for robust scaler, rename Z-score columns + colnames(zscore_patients) <- gsub("_RobustZscore", "_Zscore", colnames(zscore_patients)) + colnames(zscore_controls) <- gsub("_RobustZscore", "_Zscore", colnames(zscore_controls)) + colnames(zscore_patients) <- gsub("_Zscore", "", colnames(zscore_patients)) + colnames(zscore_controls) <- gsub("_Zscore", "", colnames(zscore_controls)) + + # Make patient list for violin plots + patient_list <- names(zscore_patients)[-c(1, 2)] + + # from table expected_biomarkers, choose selected columns + select_columns <- c("Disease", "HMDB_code", "HMDB_name") + select_col_nrs <- which(colnames(expected_biomarkers) %in% select_columns) + expected_biomarkers_select <- expected_biomarkers[, select_col_nrs] + # remove duplicates + expected_biomarkers_select <- expected_biomarkers_select[!duplicated(expected_biomarkers_select[, c(1, 2)]), ] + + # load file with explanatory information to be included in PDF. + explanation <- readLines(file_explanation) + + # first step: normal violin plots + # Find all text files in the given folder, which contain metabolite lists of which + # each file will be a page in the pdf with violin plots. + # Make a PDF file for each of the categories in metabolite_dirs + metabolite_dirs <- list.files(path = path_metabolite_groups, full.names = FALSE, recursive = FALSE) + for (metabolite_dir in metabolite_dirs) { + # create a directory for the output PDFs + pdf_dir <- paste(output_dir, metabolite_dir, sep = "/") + dir.create(pdf_dir, showWarnings = FALSE) + cat("making plots in category:", metabolite_dir, "\n") + + # get a list of all metabolite files + metabolite_files <- list.files(path = paste(path_metabolite_groups, metabolite_dir, sep = "/"), + pattern = "*.txt", full.names = FALSE, recursive = FALSE) + # put all metabolites into one list + metab_list_all <- list() + metab_list_names <- c() + cat("making plots from the input files:") + # open the text files and add each to a list of dataframes (metab_list_all) + for (file_index in seq_along(metabolite_files)) { + infile <- metabolite_files[file_index] + metab_list <- read.table(paste(path_metabolite_groups, metabolite_dir, infile, sep = "/"), + sep = "\t", header = TRUE, quote = "") + # put into list of all lists + metab_list_all[[file_index]] <- metab_list + metab_list_names <- c(metab_list_names, strsplit(infile, ".txt")[[1]][1]) + cat(paste0("\n", infile)) + } + # include list of classes in metabolite list + names(metab_list_all) <- metab_list_names + + # prepare list of metabolites; max nr_plots_perpage on one page + metab_interest_sorted <- prepare_data(metab_list_all, zscore_patients) + metab_interest_controls <- prepare_data(metab_list_all, zscore_controls) + metab_perpage <- prepare_data_perpage(metab_interest_sorted, metab_interest_controls, nr_plots_perpage, nr_pat, nr_contr) + + # for Diagnostics metabolites to be saved in Helix + if(grepl("Diagnost", pdf_dir)) { + # get table that combines DIMS results with stofgroepen/Helix table + dims_helix_table <- get_patient_data_to_helix(metab_interest_sorted, metab_list_all) + + # check if run contains Diagnostics patients (e.g. "P2024M"), not for research runs + if(any(is_diagnostic_patient(dims_helix_table$Patient))){ + # get output file for Helix + output_helix <- output_for_helix(protocol_name, dims_helix_table) + # write output to file + path_helixfile <- paste0(output_dir, "/output_Helix_", run_name,".csv") + write.csv(output_helix, path_helixfile, quote = F, row.names = F) + } + } + + # make violin plots per patient + for (pt_nr in 1:length(patient_list)) { + pt_name <- patient_list[pt_nr] + # for category Diagnostics, make list of metabolites that exceed alarm values for this patient + # for category Other, make list of top highest and lowest Z-scores for this patient + if (grepl("Diagnost", pdf_dir)) { + top_metab_pt <- prepare_alarmvalues(pt_name, dims_helix_table) + } else { + top_metab_pt <- prepare_toplist(pt_name, zscore_patients) + } + + # generate normal violin plots + create_violin_plots(pdf_dir, pt_name, metab_perpage, top_metab_pt) + + } + + } + + # Second step: dIEM plots in separate directory + diem_plot_dir <- paste(output_dir, "dIEM_plots", sep = "/") + dir.create(diem_plot_dir) + + # Select the metabolites that are associated with the top highest scoring IEM, for each patient + # disease_rank is from step 4: the dIEM algorithm. The lower the value, the more likely. + for (pt_nr in 1:length(patient_list)) { + pt_name <- patient_list[pt_nr] + # get top diseases for this patient + pt_colnr <- which(colnames(disease_rank) == pt_name) + pt_top_indices <- which(disease_rank[, pt_colnr] <= top_nr_iem) + pt_iems <- disease_rank[pt_top_indices, "Disease"] + pt_top_iems <- pt_prob_score_top_iems <- c() + for (single_iem in pt_iems) { + # get the probability score + prob_score_iem <- prob_score[which(prob_score$Disease == single_iem), pt_colnr] + # use only diseases for which probability score is above threshold + if (prob_score_iem >= threshold_iem) { + pt_top_iems <- c(pt_top_iems, single_iem) + pt_prob_score_top_iems <- c(pt_prob_score_top_iems, prob_score_iem) + } + } + + # prepare data for plotting dIEM violin plots + # If prob_score_top_iem is an empty list, don't make a plot + if (length(pt_top_iems) > 0) { + # Sorting from high to low, both prob_score_top_iems and pt_top_iems. + pt_prob_score_order <- order(-pt_prob_score_top_iems) + pt_prob_score_top_iems <- round(pt_prob_score_top_iems, 1) + pt_prob_score_top_iem_sorted <- pt_prob_score_top_iems[pt_prob_score_order] + pt_top_iem_sorted <- pt_top_iems[pt_prob_score_order] + # getting metabolites for each top_iem disease exactly like in metab_list_all + metab_iem_all <- list() + metab_iem_names <- c() + for (single_iem_index in 1:length(pt_top_iem_sorted)) { + single_iem <- pt_top_iem_sorted[single_iem_index] + single_prob_score <- pt_prob_score_top_iem_sorted[single_iem_index] + select_rows <- which(expected_biomarkers_select$Disease == single_iem) + metab_list <- expected_biomarkers_select[select_rows, ] + metab_iem_names <- c(metab_iem_names, paste0(single_iem, ", probability score ", single_prob_score)) + metab_list <- metab_list[, -1] + metab_iem_all[[single_iem_index]] <- metab_list + } + # put all metabolites into one list + names(metab_iem_all) <- metab_iem_names + + # get Zscore information from zscore_patients_copy, similar to normal violin plots + metab_iem_sorted <- prepare_data(metab_iem_all, zscore_patients_copy) + metab_iem_controls <- prepare_data(metab_iem_all, zscore_controls) + # make sure every page has 20 metabolites + diem_metab_perpage <- prepare_data_perpage(metab_iem_sorted, metab_iem_controls, nr_plots_perpage, nr_pat) + + # generate dIEM violin plots + create_violin_plots(diem_plot_dir, pt_name, diem_metab_perpage, top_metab_pt) + + } else { + cat(paste0("\n\n**** This patient had no prob_scores higher than ", threshold_iem, ". + Therefore, this pdf was not made:\t ", pt_name, "_iem \n")) + } + + } + + } +} \ No newline at end of file diff --git a/DIMS/GenerateViolinPlots.nf b/DIMS/GenerateViolinPlots.nf new file mode 100644 index 0000000..44323ac --- /dev/null +++ b/DIMS/GenerateViolinPlots.nf @@ -0,0 +1,27 @@ +process GenerateViolinPlots { + tag "DIMS GenerateViolinPlots" + label 'GenerateViolinPlots' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(excel_file) + val(analysis_id) + + output: + path('Diagnostics/*.pdf'), emit: diag_plot_files, optional: true + path('Other/*.pdf'), emit: other_plot_files, optional: true + path('dIEM_plots/*.pdf'), emit: diem_plot_files, optional: true + path('*.xlsx'), emit: excel_file, optional: true + path('*.csv'), emit: helix_file, optional: true + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/GenerateViolinPlots.R $analysis_id $params.scripts_dir $params.zscore \ + $params.path_metabolite_groups \ + $params.file_ratios_metabolites \ + $params.file_expected_biomarkers_IEM \ + $params.file_explanation \ + $params.file_isomers + """ +} diff --git a/DIMS/HMDBparts.R b/DIMS/HMDBparts.R new file mode 100644 index 0000000..3e7356d --- /dev/null +++ b/DIMS/HMDBparts.R @@ -0,0 +1,110 @@ +# adapted from hmdb_parts.R + +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) + +db_file <- cmd_args[1] +breaks_file <- cmd_args[2] +standard_run <- cmd_args[4] + +# load file with binning breaks +load(breaks_file) +min_mz <- round(breaks_fwhm[1]) +max_mz <- round(breaks_fwhm[length(breaks_fwhm)]) + +# In case of a standard run use external HMDB parts +# m/z is approximately 70 to 600: set limits between 68-71 for min and 599-610 for max +if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 599 & max_mz < 610) { + # skip generating HMDB parts + hmdb_parts_path <- cmd_args[3] + # find all files containing hmdb in file name + hmdb_parts <- list.files(hmdb_parts_path, pattern = "hmdb") + for (hmdb_file in hmdb_parts) { + file.copy(paste(hmdb_parts_path, hmdb_file, sep = "/"), "./", recursive = TRUE) + } +} else { + # generate HMDB parts in case of non-standard mz range + load(db_file) + ppm <- as.numeric(cmd_args[5]) + + scanmodes <- c("positive", "negative") + for (scanmode in scanmodes) { + if (scanmode == "negative") { + column_label <- "MNeg" + hmdb_add_iso <- hmdb_add_iso.Neg + } else if (scanmode == "positive") { + column_label <- "Mpos" + hmdb_add_iso <- hmdb_add_iso.Pos + } + + # filter mass range meassured + hmdb_add_iso = hmdb_add_iso[which(hmdb_add_iso[ , column_label] >= breaks_fwhm[1] & + hmdb_add_iso[ , column_label] <= breaks_fwhm[length(breaks_fwhm)]), ] + + # sort on mass + outlist <- hmdb_add_iso[order(as.numeric(hmdb_add_iso[ , column_label])),] + nr_rows <- dim(outlist)[1] + + # maximum number of rows per file + sub <- 20000 + end <- 0 + last_line <- sub + check <- 0 + outlist_part <- NULL + + # create parts and save to file + if (nr_rows < sub) { + outlist_part <- outlist + save(outlist_part, file = paste0(scanmode, "_hmdb.1.RData")) + } else if (nr_rows >= sub & (floor(nr_rows / sub) - 1) >= 2) { + for (i in 2:floor(nr_rows / sub) - 1) { + start <- -(sub - 1) + i * sub + end <- i * sub + + if (i > 1){ + outlist_i = outlist[c(start:end),] + nr_moved = 0 + # Use ppm to replace border to avoid cut within peak group + while ((as.numeric(outlist_i[1, column_label]) - as.numeric(outlist_part[last_line, column_label])) * 1e+06 / + as.numeric(outlist_i[1, column_label]) < ppm) { + outlist_part <- rbind(outlist_part, outlist_i[1, ]) + outlist_i <- outlist_i[-1, ] + nr_moved <- nr_moved + 1 + } + + save(outlist_part, file = paste(scanmode, "_", paste("hmdb", i-1, "RData", sep = "."), sep = "")) + check <- check + dim(outlist_part)[1] + + outlist_part <- outlist_i + last_line <- dim(outlist_part)[1] + + } else { + outlist_part <- outlist[c(start:end),] + } + } + + start <- end + 1 + end <- nr_rows + outlist_i <- outlist[c(start:end), ] + nr_moved <- 0 + + if (!is.null(outlist_part)) { + # Calculate ppm and replace border, avoid cut within peak group + while ((as.numeric(outlist_i[1, column_label]) - as.numeric(outlist_part[last_line, column_label])) * 1e+06 / + as.numeric(outlist_i[1, column_label]) < ppm) { + outlist_part <- rbind(outlist_part, outlist_i[1, ]) + outlist_i <- outlist_i[-1, ] + nr_moved <- nr_moved + 1 + } + + save(outlist_part, file = paste0(scanmode, "_hmdb_", i, ".RData")) + check <- check + dim(outlist_part)[1] + } + + outlist_part <- outlist_i + save(outlist_part, file = paste0(scanmode, "_hmdb_", i + 1, ".RData")) + check <- check + dim(outlist_part)[1] + } + } +} + diff --git a/DIMS/HMDBparts.nf b/DIMS/HMDBparts.nf new file mode 100644 index 0000000..5f19f75 --- /dev/null +++ b/DIMS/HMDBparts.nf @@ -0,0 +1,18 @@ +process HMDBparts { + tag "DIMS HMDBparts" + label 'HMDBparts' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(hmdb_db_file) + path(breaks_file) + + output: + path('*.RData') + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/HMDBparts.R $hmdb_db_file $breaks_file $params.hmdb_parts_files $params.standard_run $params.ppm + """ +} diff --git a/DIMS/HMDBparts_main.R b/DIMS/HMDBparts_main.R new file mode 100644 index 0000000..14335bf --- /dev/null +++ b/DIMS/HMDBparts_main.R @@ -0,0 +1,56 @@ +## adapted from hmdb_part_adductSums.R + +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) + +db_file <- cmd_args[1] +breaks_file <- cmd_args[2] + +load(db_file) +load(breaks_file) + +# Cut up HMDB minus adducts minus isotopes into small parts +scanmodes <- c("positive", "negative") +for (scanmode in scanmodes) { + if (scanmode == "negative") { + column_label <- "MNeg" + HMDB_add_iso <- HMDB_add_iso.Neg + } else if (scanmode == "positive") { + column_label <- "Mpos" + HMDB_add_iso <- HMDB_add_iso.Pos + } + + # filter mass range measured + outlist <- HMDB_add_iso[which(HMDB_add_iso[ , column_label] >= breaks_fwhm[1] & + HMDB_add_iso[ ,column_label] <= breaks_fwhm[length(breaks_fwhm)]), ] + + # remove adducts and isotopes, put internal standard at the beginning + outlist <- outlist[grep("HMDB", rownames(outlist), fixed = TRUE), ] + outlist <- outlist[-grep("_", rownames(outlist), fixed = TRUE), ] + # sort on m/z value + outlist <- outlist[order(outlist[ , column_label]), ] + nr_rows <- dim(outlist)[1] + + # size of hmdb parts in lines: + sub <- 1000 + end <- 0 + check <- 0 + + # generate hmdb parts + if (nr_rows >= sub & (floor(nr_rows / sub)) >= 2) { + for (i in 1:floor(nr_rows / sub)) { + start <- -(sub - 1) + i * sub + end <- i * sub + outlist_part <- outlist[c(start:end), ] + save(outlist_part, file=paste0(scanmode, "_hmdb_main.", i, ".RData")) + } + } + + # finish last hmdb part + start <- end + 1 + end <- nr_rows + + outlist_part <- outlist[c(start:end), ] + save(outlist_part, file = paste0(scanmode, "_hmdb_main.", i + 1, ".RData")) + +} \ No newline at end of file diff --git a/DIMS/HMDBparts_main.nf b/DIMS/HMDBparts_main.nf new file mode 100644 index 0000000..b38bac0 --- /dev/null +++ b/DIMS/HMDBparts_main.nf @@ -0,0 +1,19 @@ +process HMDBparts_main { + tag "DIMS HMDBparts_main" + label 'HMDBparts_main' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(hmdb_db_file) + path(breaks_file) + + output: + path('*.RData') + + script: + + """ + Rscript ${baseDir}/CustomModules/DIMS/HMDBparts_main.R $hmdb_db_file $breaks_file + """ +} diff --git a/DIMS/MakeInit.R b/DIMS/MakeInit.R new file mode 100644 index 0000000..44d4996 --- /dev/null +++ b/DIMS/MakeInit.R @@ -0,0 +1,31 @@ +## adapted from makeInit in old pipeline + +# define parameters +args <- commandArgs(trailingOnly = TRUE) + +sample_sheet <- read.csv(args[1], sep = "\t") +nr_replicates <- as.numeric(args[2]) + +sample_names <- trimws(as.vector(unlist(sample_sheet[1]))) +nr_sample_groups <- length(sample_names) / nr_replicates +group_names <- trimws(as.vector(unlist(sample_sheet[2]))) +group_names <- gsub("[^-.[:alnum:]]", "_", group_names) +group_names_unique <- unique(group_names) + +# generate the replication pattern +repl_pattern <- c() +for (sample_group in 1:nr_sample_groups) { + tmp <- c() + for (repl in nr_replicates:1) { + index <- ((sample_group * nr_replicates) - repl) + 1 + tmp <- c(tmp, sample_names[index]) + } + repl_pattern <- c(repl_pattern, list(tmp)) +} + +names(repl_pattern) <- group_names_unique + +# preview the replication pattern +print(tail(repl_pattern)) + +save(repl_pattern, file = "init.RData") diff --git a/DIMS/MakeInit.nf b/DIMS/MakeInit.nf new file mode 100644 index 0000000..7aae0e4 --- /dev/null +++ b/DIMS/MakeInit.nf @@ -0,0 +1,18 @@ +process MakeInit { + tag "DIMS MakeInit" + label 'MakeInit' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(samplesheet) + val(nr_replicates) + + output: + path('init.RData') + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/MakeInit.R $samplesheet $nr_replicates + """ +} diff --git a/DIMS/PeakFinding.R b/DIMS/PeakFinding.R new file mode 100644 index 0000000..697f8f7 --- /dev/null +++ b/DIMS/PeakFinding.R @@ -0,0 +1,53 @@ +## adapted from 4-peakFinding.R + +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) + +sample_file <- cmd_args[1] +breaks_file <- cmd_args[2] +resol <- as.numeric(cmd_args[3]) +scripts_dir <- cmd_args[4] +thresh <- 2000 +outdir <- "./" + +# load in function scripts +source(paste0(scripts_dir, "do_peakfinding.R")) +source(paste0(scripts_dir, "check_overlap.R")) +source(paste0(scripts_dir, "search_mzrange.R")) +source(paste0(scripts_dir, "fit_optim.R")) +source(paste0(scripts_dir, "fit_gaussian.R")) +source(paste0(scripts_dir, "fit_init.R")) +source(paste0(scripts_dir, "get_fwhm.R")) +source(paste0(scripts_dir, "get_stdev.R")) +source(paste0(scripts_dir, "optimize_gaussfit.R")) +source(paste0(scripts_dir, "fit_peaks.R")) +source(paste0(scripts_dir, "fit_gaussians.R")) +source(paste0(scripts_dir, "estimate_area.R")) +source(paste0(scripts_dir, "get_fit_quality.R")) +source(paste0(scripts_dir, "check_overlap.R")) +source(paste0(scripts_dir, "sum_curves.R")) +source(paste0(scripts_dir, "within_ppm.R")) + +load(breaks_file) + +# Load output of AverageTechReplicates for a sample +sample_avgtechrepl <- get(load(sample_file)) +if (grepl("_pos", sample_file)) { + scanmode <- "positive" +} else if (grepl("_neg", sample_file)) { + scanmode <- "negative" +} + +# Initialize +options(digits = 16) +# Number used to calculate area under Gaussian curve +int_factor <- 1 * 10^5 +# Initial value used to estimate scaling parameter +scale <- 2 +width <- 1024 +height <- 768 + +# run the findPeaks function + +# do_peakfinding(sample_avgtechrepl, breaks_fwhm, int_factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height) +do_peakfinding(sample_avgtechrepl, int_factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height) diff --git a/DIMS/PeakFinding.nf b/DIMS/PeakFinding.nf new file mode 100644 index 0000000..0097d12 --- /dev/null +++ b/DIMS/PeakFinding.nf @@ -0,0 +1,17 @@ +process PeakFinding { + tag "DIMS PeakFinding ${rdata_file}" + label 'PeakFinding' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + tuple(path(rdata_file), path(breaks_file)) + + output: + path '*tive.RData' + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/PeakFinding.R $rdata_file $breaks_file $params.resolution $params.scripts_dir + """ +} diff --git a/DIMS/PeakGrouping.R b/DIMS/PeakGrouping.R new file mode 100644 index 0000000..dc29e76 --- /dev/null +++ b/DIMS/PeakGrouping.R @@ -0,0 +1,198 @@ +# adapted from 6-peakGrouping.R + +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) + +hmdb_part_file <- cmd_args[1] +ppm <- as.numeric(cmd_args[2]) + +options(digits = 16) + +# load part of the HMDB +hmdb_add_iso <- get(load(hmdb_part_file)) + +# determine appropriate scanmode based on hmdb_part_file +if (grepl("negative", basename(hmdb_part_file))) { + scanmode <- "negative" +} else if (grepl("positive", basename(hmdb_part_file))) { + scanmode <- "positive" +} + +# determine batch number of HMDB part file +batch_number <- strsplit(basename(hmdb_part_file), ".", fixed = TRUE)[[1]][2] + +# load file with spectrum peaks +spec_peaks_file <- paste0("SpectrumPeaks_", scanmode, ".RData") +load(spec_peaks_file) +outlist_copy <- outlist_total +rm(outlist_total) + +# load replication pattern +pattern_file <- paste0(scanmode, "_repl_pattern.RData") +load(pattern_file) + +# determine appropriate column name in HMDB part +if (scanmode == "negative") { + column_label <- "MNeg" +} else { + column_label <- "Mpos" +} + +# Initialize +peakgrouplist_identified <- NULL +list_of_peaks_used_in_peak_groups_identified <- NULL + +# First find peak groups identified based on HMDB masses +while (dim(hmdb_add_iso)[1] > 0) { + index <- 1 + + # take one m/z value from the HMDB part and calculate mass tolerance + reference_mass <- as.numeric(hmdb_add_iso[index, column_label]) + mass_tolerance <- (reference_mass * ppm) / 10^6 + + # find the peaks in the dataset with corresponding m/z + mzmed <- as.numeric(outlist_copy[, "mzmed.pkt"]) + selp <- which((mzmed > (reference_mass - mass_tolerance)) & (mzmed < (reference_mass + mass_tolerance))) + tmplist <- outlist_copy[selp, , drop = FALSE] + list_of_peaks_used_in_peak_groups_identified <- rbind(list_of_peaks_used_in_peak_groups_identified, tmplist) + nrsamples <- length(selp) + # if peaks have been found, create a peak group + if (nrsamples > 0) { + mzmed_pgrp <- mean(as.numeric(outlist_copy[selp, "mzmed.pkt"])) + mzmin_pgrp <- reference_mass - mass_tolerance + mzmax_pgrp <- reference_mass + mass_tolerance + + # determine fit quality fq + fq_worst_pgrp <- as.numeric(max(outlist_copy[selp, "fq"])) + fq_best_pgrp <- as.numeric(min(outlist_copy[selp, "fq"])) + + # set up object for intensities for all samples + ints_allsamps <- rep(0, length(names(repl_pattern_filtered))) + names(ints_allsamps) <- names(repl_pattern_filtered) + + # Check for each sample if multiple peaks exist, if so take the sum of the intensities + labels <- unique(tmplist[, "samplenr"]) + ints_allsamps[labels] <- as.vector(unlist(lapply(labels, function(x) { + sum(as.numeric(tmplist[which(tmplist[, "samplenr"] == x), "height.pkt"])) + }))) + + # Initialize + assi_hmdb <- iso_hmdb <- hmdb_code <- NA + tmplist_mass_iso <- tmplist_mass_adduct <- NULL + + # find all entries in HMDB part with mass within ppm range + mass_all <- as.numeric(hmdb_add_iso[, column_label]) + index <- which((mass_all > (reference_mass - mass_tolerance)) & (mass_all < (reference_mass + mass_tolerance))) + tmplist_mass <- hmdb_add_iso[index, , drop = FALSE] + + if (dim(tmplist_mass)[1] > 0) { + # find isotope entries + index_iso <- grep(" iso ", tmplist_mass[, "CompoundName"], fixed = TRUE) + if (length(index_iso) > 0) { + tmplist_mass_iso <- tmplist_mass[index_iso, , drop = FALSE] + tmplist_mass <- tmplist_mass[-index_iso, , drop = FALSE] + } + + if (dim(tmplist_mass)[1] > 0) { + # find adduct entries + index_adduct <- grep(" [M", tmplist_mass[, "CompoundName"], fixed = TRUE) + if (length(index_adduct) > 0) { + tmplist_mass_adduct <- tmplist_mass[index_adduct, , drop = FALSE] + tmplist_mass <- tmplist_mass[-index_adduct, , drop = FALSE] + } + } + + # Compose a list of compounds, adducts or isotopes with corresponding m/z + if (dim(tmplist_mass)[1] > 0) { + # metabolites + assi_hmdb <- as.character(paste(as.character(tmplist_mass[, "CompoundName"]), + collapse = ";")) + hmdb_code <- as.character(paste(as.character(rownames(tmplist_mass)), + collapse = ";")) + theormz_hmdb <- as.numeric(tmplist_mass[1, column_label]) + + # adducts of metabolites + if (!is.null(tmplist_mass_adduct)) { + if (dim(tmplist_mass_adduct)[1] > 0) { + if (is.na(assi_hmdb)) { + assi_hmdb <- as.character(paste(as.character(tmplist_mass_adduct[, "CompoundName"]), + collapse = ";")) + hmdb_code <- as.character(paste(as.character(rownames(tmplist_mass_adduct)), + collapse = ";")) + } else { + assi_hmdb <- paste(assi_hmdb, + as.character(paste(as.character(tmplist_mass_adduct[, "CompoundName"]), + collapse = ";")), sep = ";") + hmdb_code <- paste(hmdb_code, + as.character(paste(as.character(rownames(tmplist_mass_adduct)), + collapse = ";")), sep = ";") + } + } + } + + # isotopes of metabolites + if (!is.null(tmplist_mass_iso)) { + if (dim(tmplist_mass_iso)[1] > 0) { + iso_hmdb <- as.character(paste(as.character(tmplist_mass_iso[, "CompoundName"]), + collapse = ";")) + } + } + + # if no metabolites have the correct m/z, look for adducts and isotopes only + } else if (!is.null(tmplist_mass_adduct)) { + theormz_hmdb <- as.numeric(tmplist_mass_adduct[1, column_label]) + + # adducts of metabolites + if (!is.null(tmplist_mass_adduct)) { + if (dim(tmplist_mass_adduct)[1] > 0) { + if (is.na(assi_hmdb)) { + assi_hmdb <- as.character(paste(as.character(tmplist_mass_adduct[, "CompoundName"]), + collapse = ";")) + hmdb_code <- as.character(paste(as.character(rownames(tmplist_mass_adduct)), + collapse = ";")) + } else { + assi_hmdb <- paste(assi_hmdb, + as.character(paste(as.character(tmplist_mass_adduct[, "CompoundName"]), + collapse = ";")), sep = ";") + hmdb_code <- paste(hmdb_code, + as.character(paste(as.character(rownames(tmplist_mass_adduct)), + collapse = ";")), sep = ";") + } + } + } + + # isotopes of metabolites + if (!is.null(tmplist_mass_iso)) { + if (dim(tmplist_mass_iso)[1] > 0) { + iso_hmdb <- as.character(paste(as.character(tmplist_mass_iso[, "CompoundName"]), + collapse = ";")) + } + } + + # if no metabolites or adducts can be found, only look for isotopes + } else if (!is.null(tmplist_mass_iso)) { + if (dim(tmplist_mass_iso)[1] > 0) { + theormz_hmdb <- as.numeric(tmplist_mass_iso[1, column_label]) + iso_hmdb <- as.character(paste(as.character(tmplist_mass_iso[, "CompoundName"]), + collapse = ";")) + } + } + } + + # combine all information + peakgrouplist_identified <- rbind(peakgrouplist_identified, cbind( + data.frame("mzmed.pgrp" = mzmed_pgrp, "fq.best" = fq_best_pgrp, "fq.worst" = fq_worst_pgrp, nrsamples, + "mzmin.pgrp" = mzmin_pgrp, "mzmax.pgrp" = mzmax_pgrp), + t(as.matrix(ints_allsamps)), + data.frame("assi_HMDB" = assi_hmdb, "iso_HMDB" = iso_hmdb, "HMDB_code" = hmdb_code, "theormz_HMDB" = theormz_hmdb) + )) + } + + # remove index metabolite from HMDB part and continue while loop + hmdb_add_iso <- hmdb_add_iso[-index, ] +} + +# save peak list corresponding to masses in HMDB part +save(list_of_peaks_used_in_peak_groups_identified, file = paste0(batch_number, "_", scanmode, "_peaks_used.RData")) +# save peak group list, identified part +save(peakgrouplist_identified, file = paste0(batch_number, "_", scanmode, "_identified.RData")) diff --git a/DIMS/PeakGrouping.nf b/DIMS/PeakGrouping.nf new file mode 100644 index 0000000..ae4382c --- /dev/null +++ b/DIMS/PeakGrouping.nf @@ -0,0 +1,20 @@ +process PeakGrouping { + tag "DIMS PeakGrouping ${hmdbpart_file}" + label 'PeakGrouping' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(hmdbpart_file) + each path(spectrumpeak_file) + each path(pattern_file) + + output: + path '*_peaks_used.RData', emit: peaks_used + path '*_identified.RData', emit: grouped_identified + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/PeakGrouping.R $hmdbpart_file $params.ppm + """ +} diff --git a/DIMS/SpectrumPeakFinding.R b/DIMS/SpectrumPeakFinding.R new file mode 100644 index 0000000..8b249e2 --- /dev/null +++ b/DIMS/SpectrumPeakFinding.R @@ -0,0 +1,61 @@ +## adapted from 5-collectSamples.R + +# define parameters +scanmodes <- c("positive", "negative") + +# Check whether all jobs terminated correctly +not_run <- NULL + +# collect spectrum peaks for each scanmode +for (scanmode in scanmodes) { + # load peak lists of all biological samples + peaklist_files <- list.files(pattern = paste0("_", scanmode, ".RData")) + + # get sample names + load(paste0(scanmode, "_repl_pattern.RData")) + group_names <- names(repl_pattern_filtered) + for (sample_nr in 1:length(group_names)) { + group <- paste0(group_names[sample_nr], "_", scanmode, ".RData") + if (!(group %in% peaklist_files)) { + not_run <- c(not_run, group) + } + } + + # Collecting samples + outlist_total <- NULL + for (file_nr in 1:length(peaklist_files)) { + cat("\n", peaklist_files[file_nr]) + load(peaklist_files[file_nr]) + if (is.null(outlist_persample) || (dim(outlist_persample)[1] == 0)) { + tmp <- strsplit(peaklist_files[file_nr], "/")[[1]] + fname <- tmp[length(tmp)] + fname <- strsplit(fname, ".RData")[[1]][1] + fname <- substr(fname, 13, nchar(fname)) + if (file_nr == 1) { + outlist_total <- c(fname, rep("-1", 5)) + } else { + outlist_total <- rbind(outlist_total, c(fname, rep("-1", 5))) + } + } else { + if (file_nr == 1) { + outlist_total <- outlist_persample + } else { + outlist_total <- rbind(outlist_total, outlist_persample) + } + } + } + + # remove negative values + index <- which(outlist_total[, "height.pkt"] <= 0) + if (length(index) > 0) outlist_total <- outlist_total[-index, ] + index <- which(outlist_total[, "mzmed.pkt"] <= 0) + if (length(index) > 0) outlist_total <- outlist_total[-index, ] + + save(outlist_total, file = paste0("./SpectrumPeaks_", scanmode, ".RData")) + + if (!is.null(not_run)) { + for (i in 1:length(not_run)) { + message(paste(not_run[i], "was not generated")) + } + } +} diff --git a/DIMS/SpectrumPeakFinding.nf b/DIMS/SpectrumPeakFinding.nf new file mode 100644 index 0000000..3590265 --- /dev/null +++ b/DIMS/SpectrumPeakFinding.nf @@ -0,0 +1,18 @@ +process SpectrumPeakFinding { + tag "DIMS SpectrumPeakFinding" + label 'SpectrumPeakFinding' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(rdata_files) + path(replication_pattern) + + output: + path 'SpectrumPeaks_*.RData' + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/SpectrumPeakFinding.R + """ +} diff --git a/DIMS/SumAdducts.R b/DIMS/SumAdducts.R new file mode 100755 index 0000000..699da4a --- /dev/null +++ b/DIMS/SumAdducts.R @@ -0,0 +1,87 @@ +## adapted from 11-runSumAdducts.R + +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) + +hmdbpart_main_file <- cmd_args[1] +scripts_dir <- cmd_args[2] +z_score <- as.numeric(cmd_args[3]) + +sum_adducts <- function(peaklist, theor_mz, grpnames_long, adducts, batch_number, scanmode, outdir, z_score) { + hmdb_codes <- rownames(theor_mz) + hmdb_names <- theor_mz[, 1, drop = FALSE] + hmdb_names[] <- lapply(hmdb_names, as.character) + + # remove isotopes + index <- grep("HMDB", hmdb_codes, fixed = TRUE) + hmdb_codes <- hmdb_codes[index] + hmdb_names <- hmdb_names[index, ] + index <- grep("_", rownames(hmdb_codes), fixed = TRUE) + if (length(index) > 0) hmdb_codes <- hmdb_codes[-index] + if (length(index) > 0) hmdb_names <- hmdb_names[-index] + + # negative + names <- NULL + adductsum <- NULL + names_long <- NULL + + if (length(hmdb_codes) != 0) { + for (i in 1:length(hmdb_codes)) { + compound <- hmdb_codes[i] + compound_plus <- c(compound, paste(compound, adducts, sep = "_")) + + metab <- unlist(lapply(peaklist$HMDB_code, function(x) { + (length(intersect(unlist(strsplit(as.vector(x), ";")), compound_plus)) > 0) + })) + + total <- c() + + # get the intensities for selected metabolite. + if (z_score == 1) { + int_cols_C <- grep("C", colnames(peaklist)[1:which(colnames(peaklist) == "avg.ctrls")]) + int_cols_P <- grep("P", colnames(peaklist)[1:which(colnames(peaklist) == "avg.ctrls")]) + int_cols <- c(int_cols_C, int_cols_P) + ints <- peaklist[metab, int_cols] + } else { + ints <- peaklist[metab, c(3:(length(grpnames_long) + 2))] + } + total <- apply(ints, 2, sum) + + if (sum(total) != 0) { + names <- c(names, compound) + adductsum <- rbind(adductsum, total) + names_long <- c(names_long, hmdb_names[i]) + } + } + + if (!is.null(adductsum)) { + rownames(adductsum) <- names + adductsum <- cbind(adductsum, "HMDB_name" = names_long) + save(adductsum, file = paste(scanmode, "_", batch_number, "_SummedAdducts.RData", sep = "")) + } + } +} + +if (grepl("positive_hmdb", hmdbpart_main_file)) { + scanmode <- "positive" + # for the adduct sum: include adducts M+Na (1) and M+K (2) + adducts <- c(1, 2) +} else if (grepl("negative_hmdb", hmdbpart_main_file)) { + scanmode <- "negative" + # for the adduct sum: include adduct M+Cl (1) + adducts <- c(1) +} + +# load input files +collect_file <- paste0("outlist_identified_", scanmode, ".RData") +load(collect_file) +repl_file <- paste0(scanmode, "_repl_pattern.RData") +load(repl_file) +outlist_part <- get(load(hmdbpart_main_file)) + +# get the number from the file name +batch_number <- strsplit(basename(hmdbpart_main_file), ".", fixed = TRUE)[[1]][2] + +outlist_total <- unique(outlist_ident) + +sum_adducts(outlist_total, outlist_part, names(repl_pattern_filtered), adducts, batch_number, scanmode, outdir, z_score) diff --git a/DIMS/SumAdducts.nf b/DIMS/SumAdducts.nf new file mode 100644 index 0000000..bbd6420 --- /dev/null +++ b/DIMS/SumAdducts.nf @@ -0,0 +1,19 @@ +process SumAdducts { + tag "DIMS SumAdducts" + label 'SumAdducts' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + each path(collect_files) + each path(replication_pattern) + path(HMDBpart_main_file) + + output: + path('*_SummedAdducts.RData') + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/SumAdducts.R $HMDBpart_main_file $params.scripts_dir $params.zscore + """ +} diff --git a/DIMS/ThermoRawFileParser.nf b/DIMS/ThermoRawFileParser.nf new file mode 100644 index 0000000..022a2c9 --- /dev/null +++ b/DIMS/ThermoRawFileParser.nf @@ -0,0 +1,18 @@ +process ConvertRawFile { + tag "DIMS ConvertRawFile ${file_id}" + label 'ThermoRawFileParser_1_1_11' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + tuple(val(file_id), path(raw_file)) + + output: + tuple(val(file_id), path("${file_id}.mzML")) + + script: + + """ + source /hpc/dbg_mz/tools/mono/etc/profile + mono /hpc/dbg_mz/tools/ThermoRawFileParser_1.1.11/ThermoRawFileParser.exe -i=${raw_file} --output=./ -p + """ +} diff --git a/DIMS/UnidentifiedCalcZscores.R b/DIMS/UnidentifiedCalcZscores.R new file mode 100755 index 0000000..ec31a3f --- /dev/null +++ b/DIMS/UnidentifiedCalcZscores.R @@ -0,0 +1,61 @@ +## adapted from 10-collectSamplesFilled.R + +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) + +scripts_dir <- cmd_args[1] +ppm <- as.numeric(cmd_args[2]) +z_score <- as.numeric(cmd_args[3]) + +source(paste0(scripts_dir, "merge_duplicate_rows.R")) +source(paste0(scripts_dir, "calculate_zscores.R")) + +# for each scan mode, collect all filled peak group lists +scanmodes <- c("positive", "negative") + +for (scanmode in scanmodes) { + filled_files <- list.files("./", full.names = TRUE, pattern = paste0(scanmode, ".{1,}_Unidentified_filled")) + # load files and combine into one object + outlist_total <- NULL + for (file_nr in 1:length(filled_files)) { + peakgrouplist_filled <- get(load(filled_files[file_nr])) + outlist_total <- rbind(outlist_total, peakgrouplist_filled) + } + + # remove duplicates; peak groups with exactly the same m/z + outlist_total <- merge_duplicate_rows(outlist_total) + + # sort on mass + outlist_total <- outlist_total[order(outlist_total[, "mzmed.pgrp"]), ] + + # load replication pattern + pattern_file <- paste0(scanmode, "_repl_pattern.RData") + repl_pattern <- get(load(pattern_file)) + + if (z_score == 1) { + # calculate Z-scores + outlist_stats <- calculate_zscores(outlist_total, adducts = FALSE) + nr_removed_samples <- length(which(repl_pattern[] == "character(0)")) + order_index_int <- order(colnames(outlist_stats)[8:(length(repl_pattern) - nr_removed_samples + 7)]) + outlist_stats_more <- cbind(outlist_stats[, 1:7], + outlist_stats[, (length(repl_pattern) - nr_removed_samples + 8): + (length(repl_pattern) - nr_removed_samples + 8 + 6)], + outlist_stats[, 8:(length(repl_pattern) - nr_removed_samples + 7)][order_index_int], + outlist_stats[, (length(repl_pattern) - nr_removed_samples + 5 + 10):ncol(outlist_stats)]) + + tmp_index <- grep("_Zscore", colnames(outlist_stats_more), fixed = TRUE) + tmp_index_order <- order(colnames(outlist_stats_more[, tmp_index])) + tmp <- outlist_stats_more[, tmp_index[tmp_index_order]] + outlist_stats_more <- outlist_stats_more[, -tmp_index] + outlist_stats_more <- cbind(outlist_stats_more, tmp) + outlist_total <- outlist_stats_more + } + + outlist_not_ident <- outlist_total + + # Save output + remove_columns <- c("fq.best", "fq.worst", "mzmin.pgrp", "mzmax.pgrp") + remove_colindex <- which(colnames(outlist_not_ident) %in% remove_columns) + outlist_not_ident <- outlist_not_ident[, -remove_colindex] + save(outlist_not_ident, file = paste0("unidentified_outlist_", scanmode, ".RData")) +} diff --git a/DIMS/UnidentifiedCalcZscores.nf b/DIMS/UnidentifiedCalcZscores.nf new file mode 100644 index 0000000..675d9cd --- /dev/null +++ b/DIMS/UnidentifiedCalcZscores.nf @@ -0,0 +1,18 @@ +process UnidentifiedCalcZscores { + tag "DIMS UnidentifiedCalcZscores" + label 'UnidentifiedCalcZscores' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(unidentified_filled_files) + each path(replication_pattern) + + output: + path('unidentified_outlist*.RData') + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/UnidentifiedCalcZscores.R $params.scripts_dir $params.ppm $params.zscore + """ +} diff --git a/DIMS/UnidentifiedCollectPeaks.R b/DIMS/UnidentifiedCollectPeaks.R new file mode 100755 index 0000000..1536213 --- /dev/null +++ b/DIMS/UnidentifiedCollectPeaks.R @@ -0,0 +1,68 @@ +## adapted from 7-collectSamplesGroupedHMDB.R + +# load required packages +suppressMessages(library("dplyr")) + +options(digits = 16) + +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) + +ppm <- as.numeric(cmd_args[1]) + +scanmodes <- c("positive", "negative") + +for (scanmode in scanmodes) { + # get list of all files that contain lists of peaks that were used in identified peak grouping + files <- list.files("./", pattern = paste(scanmode, "_peaks_used.RData", sep = "")) + # load the list of all peaks + load(paste0("SpectrumPeaks_", scanmode, ".RData")) + + # Make a list of indexes of peaks that have been identified, then remove these from the peaklist. + remove <- NULL + for (file_index in 1:length(files)) { + # load list_of_peaks_used_in_peak_groups_identified + load(files[file_index]) + remove <- c(remove, + which(outlist_total[, "mzmed.pkt"] %in% list_of_peaks_used_in_peak_groups_identified[, "mzmed.pkt"])) # nolint + } + outlist_rest <- outlist_total[-remove, ] + + # sort on mass + outlist <- outlist_rest[order(as.numeric(outlist_rest[, "mzmed.pkt"])), ] + # save output + # save(outlist, file = paste0("SpectrumPeaks_", scanmode, "_Unidentified.RData")) + + size_parts <- 10000 + # start <- 1 + + num_parts <- ceiling(nrow(outlist) / size_parts) + + for (part in 1:num_parts){ + if (part == 1) { + start_part <- 1 + end_part <- size_parts + } else { + start_part <- (part - 1) * size_parts + 1 + } + if (part == num_parts) { + end_part <- nrow(outlist) + } else if (part != 1) { + end_part <- part * size_parts + } + + outlist_part <- outlist[c(start_part:end_part), ] + + # add ppm extra after end + if (part != num_parts) { + mz_end <- outlist_part[nrow(outlist_part), "mzmed.pkt"] + mz_ppm_range <- ppm * as.numeric(mz_end) / 1e+06 + mz_end_plus_ppm <- as.numeric(mz_end) + mz_ppm_range + outlist_after_part <- as.data.frame(outlist) %>% filter(mzmed.pkt > mz_end & mzmed.pkt <= mz_end_plus_ppm) + + outlist_part <- rbind(outlist_part, outlist_after_part) + } + + save(outlist_part, file = paste0(scanmode, "_", paste0("unidentified_part_", part, ".RData"))) + } +} diff --git a/DIMS/UnidentifiedCollectPeaks.nf b/DIMS/UnidentifiedCollectPeaks.nf new file mode 100644 index 0000000..0983e9e --- /dev/null +++ b/DIMS/UnidentifiedCollectPeaks.nf @@ -0,0 +1,18 @@ +process UnidentifiedCollectPeaks { + tag "DIMS UnidentifiedCollectPeaks" + label 'UnidentifiedCollectPeaks' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(spectrumpeaks_file) + path(peaklist_identified) + + output: + path('*.RData') + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/UnidentifiedCollectPeaks.R $params.ppm + """ +} diff --git a/DIMS/UnidentifiedFillMissing.R b/DIMS/UnidentifiedFillMissing.R new file mode 100755 index 0000000..78b7951 --- /dev/null +++ b/DIMS/UnidentifiedFillMissing.R @@ -0,0 +1,48 @@ +# adapted from 9-runFillMissing.R + +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) + +peakgrouplist_file <- cmd_args[1] +# peakgrouplist_file2 <- cmd_args[2] +scripts_dir <- cmd_args[2] +thresh <- as.numeric(cmd_args[3]) +resol <- as.numeric(cmd_args[4]) +ppm <- as.numeric(cmd_args[5]) +outdir <- "./" + +# load in function scripts +source(paste0(scripts_dir, "replace_zeros.R")) +source(paste0(scripts_dir, "fit_optim.R")) +source(paste0(scripts_dir, "get_fwhm.R")) +source(paste0(scripts_dir, "get_stdev.R")) +source(paste0(scripts_dir, "estimate_area.R")) +source(paste0(scripts_dir, "optimize_gaussfit.R")) +source(paste0(scripts_dir, "identify_noisepeaks.R")) +source(paste0(scripts_dir, "get_element_info.R")) +source(paste0(scripts_dir, "atomic_info.R")) + +# peakgrouplist_files <- c(peakgrouplist_file1, peakgrouplist_file2) +# for (peakgrouplist_file in peakgrouplist_files) { + +if (grepl("_pos", basename(peakgrouplist_file))) { + scanmode <- "positive" +} else if (grepl("_neg", basename(peakgrouplist_file))) { + scanmode <- "negative" +} + +# get replication pattern for sample names +pattern_file <- paste0(scanmode, "_repl_pattern.RData") +repl_pattern <- get(load(pattern_file)) + +# load peak group list and determine output file name +outpgrlist_identified <- get(load(peakgrouplist_file)) + +outputfile_name <- gsub(".RData", "_filled.RData", peakgrouplist_file) + +# replace missing values (zeros) with random noise +peakgrouplist_filled <- replace_zeros(outpgrlist_identified, repl_pattern, scanmode, resol, outdir, thresh, ppm) + +# save output +save(peakgrouplist_filled, file = outputfile_name) +# } diff --git a/DIMS/UnidentifiedFillMissing.nf b/DIMS/UnidentifiedFillMissing.nf new file mode 100644 index 0000000..3f9098e --- /dev/null +++ b/DIMS/UnidentifiedFillMissing.nf @@ -0,0 +1,18 @@ +process UnidentifiedFillMissing { + tag "DIMS UnidentifiedFillMissing ${groupedlist_file}" + label 'UnidentifiedFillMissing' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(groupedlist_file) + each path(replication_pattern) + + output: + path('*_filled.RData') + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/UnidentifiedFillMissing.R $groupedlist_file $params.scripts_dir $params.thresh $params.resolution $params.ppm + """ +} diff --git a/DIMS/UnidentifiedPeakGrouping.R b/DIMS/UnidentifiedPeakGrouping.R new file mode 100755 index 0000000..edeb2f7 --- /dev/null +++ b/DIMS/UnidentifiedPeakGrouping.R @@ -0,0 +1,108 @@ +## adapted from 8-peakGrouping.rest.R + +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) + +unidentified_peaklist <- cmd_args[1] +resol <- as.numeric(cmd_args[2]) +ppm <- as.numeric(cmd_args[3]) +outdir <- "./" + +options(digits = 16) + +# function for grouping unidentified peaks +grouping_rest <- function(outdir, unidentified_peaklist, scanmode, ppm) { + outlist_copy <- get(load(unidentified_peaklist)) + load(paste0(scanmode, "_repl_pattern.RData")) + outpgrlist <- NULL + + # group on highest peaks + range <- ppm * 1e-06 + + # temporary: speed up this step by limiting the number of rows used in while loop + # this script needs to be parallellized + nrow_div <- nrow(outlist_copy) / 1.1 + + # while (dim(outlist_copy)[1] > 0) { + while (dim(outlist_copy)[1] > nrow_div) { + select_max_height <- which(as.numeric(outlist_copy[, "height.pkt"]) == max(as.numeric(outlist_copy[, "height.pkt"])))[1] + + # ppm range around max + mzref <- as.numeric(outlist_copy[select_max_height, "mzmed.pkt"]) + pkmin <- -(range * mzref - mzref) + pkmax <- 2 * mzref - pkmin + + select_peaks_within_range <- as.numeric(outlist_copy[, "mzmed.pkt"]) > pkmin & as.numeric(outlist_copy[, "mzmed.pkt"]) < pkmax + tmplist <- outlist_copy[select_peaks_within_range, , drop = FALSE] + + nrsamples <- length(unique(tmplist[, "samplenr"])) + if (nrsamples > 0) { + mzmed_pgrp <- mean(as.numeric(outlist_copy[select_peaks_within_range, "mzmed.pkt"])) + mzmin_pgrp <- -(range * mzmed_pgrp - mzmed_pgrp) + mzmax_pgrp <- 2 * mzmed_pgrp - mzmin_pgrp + # select peaks within mz range + select_peaks_within_range <- as.numeric(outlist_copy[, "mzmed.pkt"]) > mzmin_pgrp & as.numeric(outlist_copy[, "mzmed.pkt"]) < mzmax_pgrp + tmplist <- outlist_copy[select_peaks_within_range, , drop = FALSE] + + # remove used peaks + tmp <- as.vector(which(tmplist[, "height.pkt"] == -1)) + if (length(tmp) > 0) tmplist <- tmplist[-tmp, , drop = FALSE] + + nrsamples <- length(unique(tmplist[, "samplenr"])) + fq_worst_pgrp <- as.numeric(max(outlist_copy[select_peaks_within_range, "fq"])) + fq_best_pgrp <- as.numeric(min(outlist_copy[select_peaks_within_range, "fq"])) + ints_allsamps <- rep(0, length(names(repl_pattern_filtered))) + names(ints_allsamps) <- names(repl_pattern_filtered) + + # Check for each sample if multiple peaks exists, if so take the sum + labels <- unique(tmplist[, "samplenr"]) + ints_allsamps[labels] <- as.vector(unlist(lapply(labels, function(x) { + sum(as.numeric(tmplist[which(tmplist[, "samplenr"] == x), "height.pkt"])) + }))) + + # combine all information + outpgrlist <- rbind(outpgrlist, c(mzmed_pgrp, fq_best_pgrp, fq_worst_pgrp, nrsamples, + mzmin_pgrp, mzmax_pgrp, ints_allsamps, NA, NA, NA, NA)) + } + + outlist_copy <- outlist_copy[-which(select_peaks_within_range == TRUE), , drop = FALSE] + } + + outpgrlist <- as.data.frame(outpgrlist) + colnames(outpgrlist)[1:6] <- c("mzmed.pgrp", "fq.best", "fq.worst", "nrsamples", "mzmin.pgrp", "mzmax.pgrp") + colnames(outpgrlist)[(length(repl_pattern_filtered) + 7):ncol(outpgrlist)] <- c("assi_HMDB", "iso_HMDB", + "HMDB_code", "theormz_HMDB") + + return(outpgrlist) +} + +# scanmodes <- c("positive", "negative") + +# for (scanmode in scanmodes) { +# # generate peak group lists of the unidentified peaks +# unidentified_peaklist <- paste0("SpectrumPeaks_", scanmode, "_Unidentified.RData") +# outpgrlist <- grouping_rest(outdir, unidentified_peaklist, scanmode, ppm = ppm) +# write.table(outpgrlist, file = paste0("PeakGroupList_", scanmode, "_Unidentified.txt")) + +# # save output in RData format for further processing +# save(outpgrlist, file=paste0("PeakGroupList_", scanmode, "_Unidentified.RData")) +# } + +# determine appropriate scanmode based on unidentified_peaklist file +if (grepl("negative", basename(unidentified_peaklist))) { + scanmode <- "negative" +} else if (grepl("positive", basename(unidentified_peaklist))) { + scanmode <- "positive" +} + +# generate peak group lists of the unidentified peaks +outpgrlist <- grouping_rest(outdir, unidentified_peaklist, scanmode, ppm = ppm) + +# determine part number of unidentified_peaklist file +part_number <- gsub("\\D", "", basename(unidentified_peaklist)) + +# save output in txt format +write.table(outpgrlist, file = paste0("PeakGroupList_", scanmode, "_part_", part_number, "_Unidentified.txt")) + +# save output in RData format for further processing +save(outpgrlist, file=paste0("PeakGroupList_", scanmode, "_part_", part_number, "_Unidentified.RData")) diff --git a/DIMS/UnidentifiedPeakGrouping.nf b/DIMS/UnidentifiedPeakGrouping.nf new file mode 100644 index 0000000..ce140ec --- /dev/null +++ b/DIMS/UnidentifiedPeakGrouping.nf @@ -0,0 +1,19 @@ +process UnidentifiedPeakGrouping { + tag "DIMS UnidentifiedPeakGrouping ${unidentified_spectrumpeaks_files}" + label 'UnidentifiedPeakGrouping' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(unidentified_spectrumpeaks_files) + each path(replication_pattern) + + output: + path('*_Unidentified.txt') + path('*_Unidentified.RData'), emit: grouped_unidentified + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/UnidentifiedPeakGrouping.R $unidentified_spectrumpeaks_files $params.resolution $params.ppm + """ +} diff --git a/DIMS/Utils/RawFiles.nf b/DIMS/Utils/RawFiles.nf new file mode 100644 index 0000000..8cc0efe --- /dev/null +++ b/DIMS/Utils/RawFiles.nf @@ -0,0 +1,12 @@ +def extractRawfilesFromDir(dir) { + // Original code from: https://github.com/SciLifeLab/Sarek - MIT License - Copyright (c) 2016 SciLifeLab + dir = dir.tokenize().collect{"$it/*.raw"} + Channel + .fromPath(dir, type:'file') + .ifEmpty { error "No raw files found in ${dir}." } + .map { rawfiles_path -> + def file_id = rawfiles_path.getSimpleName() + [file_id, rawfiles_path] + } +} + diff --git a/DIMS/Utils/add_lab_id_and_onderzoeksnummer.R b/DIMS/Utils/add_lab_id_and_onderzoeksnummer.R new file mode 100644 index 0000000..f2ff13e --- /dev/null +++ b/DIMS/Utils/add_lab_id_and_onderzoeksnummer.R @@ -0,0 +1,16 @@ +add_lab_id_and_onderzoeksnummer <- function(df_metabs_helix) { + #' Adding labnummer and Onderzoeksnummer to a dataframe + #' + #' @param df_metabs_helix: dataframe with patient data to be uploaded to Helix + #' + #' @return: dataframe with added labnummer and Onderzoeksnummer columns + + # Split patient number into labnummer and Onderzoeksnummer + for (row in 1:nrow(df_metabs_helix)) { + df_metabs_helix[row, "labnummer"] <- gsub("^P|\\.[0-9]*", "", df_metabs_helix[row, "Patient"]) + labnummer_split <- strsplit(as.character(df_metabs_helix[row, "labnummer"]), "M")[[1]] + df_metabs_helix[row, "Onderzoeksnummer"] <- paste0("MB", labnummer_split[1], "/", labnummer_split[2]) + } + + return(df_metabs_helix) +} diff --git a/DIMS/Utils/atomic_info.R b/DIMS/Utils/atomic_info.R new file mode 100644 index 0000000..94d0e83 --- /dev/null +++ b/DIMS/Utils/atomic_info.R @@ -0,0 +1,92 @@ +## adapted from globalAssignments.HPC.R +# relative abundancies from theoretical mass and composition +# not a function, but a large amount of objects that will become available in memory +# refactor: include in get_element_info +# check if all these elements are necessary. Only use snake_make for Hmass -> hydrogen_mass. +# move source library to higher level +options(digits = 16) + +suppressPackageStartupMessages(library(lattice)) + +# The following list was copied from Rdisop elements.R and corrected for C, H, O, Cl, S according to NIST +Ba <- list(name = "Ba", mass = 130, isotope = list(mass = c(-0.093718, 0, -0.094958, 0, -0.095514, -0.094335, -0.095447, -0.094188, -0.094768), abundance = c(0.00106, 0, 0.00101, 0, 0.02417, 0.06592, 0.07854, 0.1123, 0.717))) +Br <- list(name = "Br", mass = 79, isotope = list(mass = c(-0.0816639, 0, -0.083711), abundance = c(0.5069, 0, 0.4931))) +C <- list(name = "C", mass = 12, isotope = list(mass = c(0, 0.003354838, 0.003241989), abundance = c(0.9893, 0.0107, 0))) +Ca <- list(name = "Ca", mass = 40, isotope = list(mass = c(-0.0374094, 0, -0.0413824, -0.0412338, -0.0445194, 0, -0.046311, 0, -0.047467), abundance = c(0.96941, 0, 0.00647, 0.00135, 0.02086, 0, 4e-05, 0, 0.00187))) +Cl <- list(name = "Cl", mass = 35, isotope = list(mass = c(-0.03114732, 0, -0.03409741), abundance = c(0.7576, 0, 0.2424))) +Cr <- list(name = "Cr", mass = 50, isotope = list(mass = c(-0.0539536, 0, -0.0594902, -0.0593487, -0.0611175), abundance = c(0.04345, 0, 0.83789, 0.09501, 0.02365))) +Cu <- list(name = "Cu", mass = 63, isotope = list(mass = c(-0.0704011, 0, -0.0722071), abundance = c(0.6917, 0, 0.3083))) +F <- list(name = "F", mass = 19, isotope = list(mass = c(-0.00159678), abundance = c(1))) +Fe <- list(name = "Fe", mass = 54, isotope = list(mass = c(-0.0603873, 0, -0.0650607, -0.0646042, -0.0667227), abundance = c(0.058, 0, 0.9172, 0.022, 0.0028))) +H <- list(name = "H", mass = 1, isotope = list(mass = c(0.00782503207, 0.014101778, 0.01604928), abundance = c(0.999885, 0.000115, 0))) +Hg <- list(name = "Hg", mass = 196, isotope = list(mass = c(-0.034193, 0, -0.033257, -0.031746, -0.0317, -0.029723, -0.029383, 0, -0.026533), abundance = c(0.0015, 0, 0.0997, 0.1687, 0.231, 0.1318, 0.2986, 0, 0.0687))) +I <- list(name = "I", mass = 127, isotope = list(mass = c(-0.095527), abundance = c(1))) +K <- list(name = "K", mass = 39, isotope = list(mass = c(-0.0362926, -0.0360008, -0.0381746), abundance = c(0.932581, 0.000117, 0.067302))) +Li <- list(name = "Li", mass = 6, isotope = list(mass = c(0.0151214, 0.016003), abundance = c(0.075, 0.925))) +Mg <- list(name = "Mg", mass = 24, isotope = list(mass = c(-0.0149577, -0.0141626, -0.0174063), abundance = c(0.7899, 0.1, 0.1101))) +Mn <- list(name = "Mn", mass = 55, isotope = list(mass = c(-0.0619529), abundance = c(1))) +N <- list(name = "N", mass = 14, isotope = list(mass = c(0.003074002, 0.00010897), abundance = c(0.99634, 0.00366))) +Na <- list(name = "Na", mass = 23, isotope = list(mass = c(-0.0102323), abundance = c(1))) +Ni <- list(name = "Ni", mass = 58, isotope = list(mass = c(-0.0646538, 0, -0.0692116, -0.0689421, -0.0716539, 0, -0.0720321), abundance = c(0.68077, 0, 0.26223, 0.0114, 0.03634, 0, 0.00926))) +O <- list(name = "O", mass = 16, isotope = list(mass = c(-0.00508538044, -0.0008683, -0.0008397), abundance = c(0.99757, 0.000381, 0.00205))) +P <- list(name = "P", mass = 31, isotope = list(mass = c(-0.026238), abundance = c(1))) +S <- list(name = "S", mass = 32, isotope = list(mass = c(-0.027929, -0.02854124, -0.0321331, 0, -0.03291924), abundance = c(0.9499, 0.0075, 0.0425, 0, 1e-04))) +Se <- list(name = "Se", mass = 74, isotope = list(mass = c(-0.0775254, 0, -0.080788, -0.0800875, -0.0826924, 0, -0.0834804, 0, -0.0833022), abundance = c(0.0089, 0, 0.0936, 0.0763, 0.2378, 0, 0.4961, 0, 0.0873))) +Si <- list(name = "Si", mass = 28, isotope = list(mass = c(-0.0230729, -0.0235051, -0.0262293), abundance = c(0.9223, 0.0467, 0.031))) +Sn <- list(name = "Sn", mass = 112, isotope = list(mass = c(-0.095174, 0, -0.097216, -0.096652, -0.098253, -0.097044, -0.098391, -0.09669, -0.0978009, 0, -0.0965596, 0, -0.0947257), abundance = c(0.0097, 0, 0.0065, 0.0034, 0.1453, 0.0768, 0.2423, 0.0859, 0.3259, 0, 0.0463, 0, 0.0579))) +Zn <- list(name = "Zn", mass = 64, isotope = list(mass = c(-0.0708552, 0, -0.0739653, -0.0728709, -0.0751541, 0, -0.074675), abundance = c(0.486, 0, 0.279, 0.041, 0.188, 0, 0.006))) + +# The following list is for our own use +NH4 <- list(name = "NH4", mass = 18, isotope = list(mass = c(0.03437, 0.03141, -0.95935)), abundance = c(0.995, 0.004, 0.001)) # SISweb: 18.03437 100 19.03141 0.4 19.04065 0.1 +Ac <- list(name = "Ac", mass = 60, isotope = list(mass = c(0.02114, 0.02450, 0.02538)), abundance = c(0.975, 0.021, 0.004)) # SISweb: 60.02114 100 61.02450 2.2 62.02538 0.4 +NaCl <- list(name = "NaCl", mass = 58, isotope = list(mass = c(-0.04137, 0, -0.04433)), abundance = c(0.755, 0, 0.245)) # SISweb: 57.95862 100 59.95567 32.4 +NaCl2 <- list(name = "NaCl2", mass = 116, isotope = list(mass = c(-0.08274, 0, -0.08866)), abundance = c(0.755, 0, 0.245)) # SISweb: 57.95862 100 59.95567 32.4 +NaCl3 <- list(name = "NaCl3", mass = 174, isotope = list(mass = c(-0.12411, 0, -0.13299)), abundance = c(0.755, 0, 0.245)) # SISweb: 57.95862 100 59.95567 32.4 +NaCl4 <- list(name = "NaCl4", mass = 232, isotope = list(mass = c(-0.16548, 0, -0.17732)), abundance = c(0.755, 0, 0.245)) # SISweb: 57.95862 100 59.95567 32.4 +NaCl5 <- list(name = "NaCl5", mass = 290, isotope = list(mass = c(-0.20685, 0, -0.22165)), abundance = c(0.755, 0, 0.245)) # SISweb: 57.95862 100 59.95567 32.4 +For <- list(name = "For", mass = 45, isotope = list(mass = c(-0.00233, 0.00103)), abundance = c(0.989, 0.011)) # SISweb: 46.00549 100 47.00885 1.1 (47.0097 0.1) 48.00973 0.4 +Na2 <- list(name = "2Na-H", mass = 46, isotope = list(mass = c(-1.0282896)), abundance = c(1)) # SISweb for Na2: 45.97954 100 # minus 1 H ! +Met <- list(name = "CH3OH", mass = 32, isotope = list(mass = c(1.034045, 1.037405)), abundance = c(0.989, 0.011)) # SISweb: 32.02622 100 33.02958 1.1 33.0325 0.1 34.03046 0.2 +CH3OH <- list(name = "CH3OH", mass = 32, isotope = list(mass = c(1.034045, 1.037405)), abundance = c(0.989, 0.011)) # SISweb: 32.02622 100 33.02958 1.1 33.0325 0.1 34.03046 0.2 +Na3 <- list(name = "3Na-2H", mass = 69, isotope = list(mass = c(-2.0463469)), abundance = c(1)) # SISweb for Na2: 45.97954 100 # minus 1 H ! +KCl <- list(name = "KCl", mass = 74, isotope = list(mass = c(-0.06744, 0.92961, -0.06744, 0.92772)), abundance = c(0.7047, 0.2283, 0.0507, 0.0162)) # SISweb: 73.93256 100 75.92961 32.4 75.93067 7.2 77.92772 2.3 +H2PO4 <- list(name = "H2PO4", mass = 97, isotope = list(mass = c(-0.03091)), abundance = c(1)) +HSO4 <- list(name = "HSO4", mass = 97, isotope = list(mass = c(-0.04042, 0, -0.04462)), abundance = c(0.96, 0, 0.04)) +Met2 <- list(name = "Met2", mass = 64, isotope = list(mass = c(1.060265, 1.013405)), abundance = c(0.978, 0.022)) +Met3 <- list(name = "Met3", mass = 96, isotope = list(mass = c(1.086485, 1.089845)), abundance = c(0.969, 0.031)) +Met4 <- list(name = "Met4", mass = 128, isotope = list(mass = c(1.112705, 1.116065)), abundance = c(0.959, 0.041)) +Met5 <- list(name = "Met5", mass = 160, isotope = list(mass = c(1.20935, 1.142285)), abundance = c(0.949, 0.051)) +NaminH <- list(name = "Na-H", mass = 21, isotope = list(mass = c(-0.02571416)), abundance = c(1)) +KminH <- list(name = "K-H", mass = 37, isotope = list(mass = c(-0.05194, 0.94617)), abundance = c(0.9328, 0.0672)) +H2O <- list(name = "H2O", mass = -19, isotope = list(mass = c(-0.01894358)), abundance = c(1)) +NaK <- list(name = "NaK-H", mass = 61, isotope = list(mass = c(-0.054345, 0.943765)), abundance = c(0.9328, 0.0672)) +min2H <- list(name = "min2H", mass = -2, isotope = list(mass = c(-0.0151014)), abundance = c(1)) +plus2H <- list(name = "plus2H", mass = 2, isotope = list(mass = c(0.0151014)), abundance = c(1)) +plus2Na <- list(name = "plus2Na", mass = 46, isotope = list(mass = c(-0.02046), abundance = c(1))) +plusNaH <- list(name = "plusNaH", mass = 24, isotope = list(mass = c(-0.00295588), abundance = c(1))) +plusKH <- list(name = "plusKH", mass = 40, isotope = list(mass = c(-0.029008, 0.969101)), abundance = c(0.9328, 0.0672)) +plusHNH <- list(name = "plusHNH", mass = 19, isotope = list(mass = c(0.04164642)), abundance = c(1)) +min3H <- list(name = "min3H", mass = -3, isotope = list(mass = c(-0.02182926)), abundance = c(1)) +plus3H <- list(name = "plus3H", mass = 3, isotope = list(mass = c(0.02182926)), abundance = c(1)) +plus3Na <- list(name = "plus3Na", mass = 68, isotope = list(mass = c(0.96931)), abundance = c(1)) +plus2NaH <- list(name = "plus2NaH", mass = 47, isotope = list(mass = c(0.2985712)), abundance = c(1)) +plusNa2H <- list(name = "plusNa2H", mass = 25, isotope = list(mass = c(0.00432284)), abundance = c(1)) + +# add first isotope of Cl because of high abundance +Cl37 <- list(name = "Cl37", mass = 37, isotope = list(mass = c(-0.03409741)), abundance = c(1)) + +all_elements <- list(Ba, Br, C, Ca, Cl, Cr, Cu, F, Fe, H, Hg, I, K, Li, Mg, Mn, N, Na, Ni, O, P, S, Se, Si, Sn, Zn) +all_adducts <- list(Ba, Br, Ca, Cl, Cl37, Cr, Cu, Fe, Hg, I, K, Li, Mg, Mn, Na, Ni, Se, Si, Sn, Zn, + NH4, Ac, NaCl, For, Na2, CH3OH, NaCl2, NaCl3, NaCl4, NaCl5, Na3, KCl, H2PO4, HSO4, + Met2, Met3, Met4, Met5, NaminH, KminH, H2O, NaK, min2H, plus2H, plus2Na, plusNaH, + plusKH, min3H, plus3H, plusHNH, plus3Na, plus2NaH, plusNa2H) + +atoms_inuse <- c("P", "O", "N", "C", "H", "S", "Cl", "D", "13C", "34S", "18O", "37Cl") +atomic_weights <- c(30.97376163, 15.99491463, 14.0030740052, 12.0000, 1.0078250321, 31.9720707, 34.968852721, 2.0141017778, 13.0033548378, 33.96786690, 17.9991610, 36.96590259) +electron <- 0.00054858 + +hydrogen_mass <- Hmass <- H$mass + H$isotope$mass[1] +Dmass <- H$mass + 1 + H$isotope$mass[2] +Tmass <- H$mass + 2 + H$isotope$mass[3] +C13mass <- C$mass + 1 + C$isotope$mass[2] +N15mass <- N$mass + 1 + N$isotope$mass[2] diff --git a/DIMS/Utils/calculate_zscores.R b/DIMS/Utils/calculate_zscores.R new file mode 100644 index 0000000..d687376 --- /dev/null +++ b/DIMS/Utils/calculate_zscores.R @@ -0,0 +1,63 @@ +## adapted from statistics_z.R +# refactor: change column names from avg.ctrls to avg_ctrls, sd.ctrls to sd_ctrls +# check logic of parameter adducts +calculate_zscores <- function(peakgroup_list, adducts) { + #' Calculate Z-scores for peak groups based on average and standard deviation of controls + #' + #' @param peakgroup_list: Peak group list (matrix) + #' @param sort_col: Column to sort on (string) + #' @param adducts: Parameter indicating whether there are adducts in the list (boolean) + #' + #' @return peakgroup_list_dedup: de-duplicated peak group list (matrix) + + case_label <- "P" + control_label <- "C" + # get index for new column names + startcol <- ncol(peakgroup_list) + 3 + + # calculate mean and standard deviation for Control group + ctrl_cols <- grep(control_label, colnames(peakgroup_list), fixed = TRUE) + case_cols <- grep(case_label, colnames(peakgroup_list), fixed = TRUE) + int_cols <- c(ctrl_cols, case_cols) + # set al zeros to NA + peakgroup_list[, int_cols][peakgroup_list[, int_cols] == 0] <- NA + ctrl_ints <- peakgroup_list[, ctrl_cols, drop = FALSE] + peakgroup_list$avg.ctrls <- apply(ctrl_ints, 1, function(x) mean(as.numeric(x), na.rm = TRUE)) + peakgroup_list$sd.ctrls <- apply(ctrl_ints, 1, function(x) sd(as.numeric(x), na.rm = TRUE)) + + # set new column names and calculate Z-scores + colnames_zscores <- NULL + for (col_index in int_cols) { + col_name <- colnames(peakgroup_list)[col_index] + colnames_zscores <- c(colnames_zscores, paste0(col_name, "_Zscore")) + zscores_1col <- (as.numeric(as.vector(unlist(peakgroup_list[, col_index]))) - + peakgroup_list$avg.ctrls) / peakgroup_list$sd.ctrls + peakgroup_list <- cbind(peakgroup_list, zscores_1col) + } + + # apply new column names to columns at end plus avg and sd columns + colnames(peakgroup_list)[startcol:ncol(peakgroup_list)] <- colnames_zscores + + # add ppm deviation column + zscore_cols <- grep("Zscore", colnames(peakgroup_list), fixed = TRUE) + if (!adducts) { + if ((dim(peakgroup_list[, zscore_cols])[2] + 6) != (startcol - 1)) { + ppmdev <- array(1:nrow(peakgroup_list), dim = c(nrow(peakgroup_list))) + # calculate ppm deviation + for (i in 1:nrow(peakgroup_list)) { + if (!is.na(peakgroup_list$theormz_HMDB[i]) && + !is.null(peakgroup_list$theormz_HMDB[i]) && + (peakgroup_list$theormz_HMDB[i] != "")) { + ppmdev[i] <- 10^6 * (as.numeric(as.vector(peakgroup_list$mzmed.pgrp[i])) - + as.numeric(as.vector(peakgroup_list$theormz_HMDB[i]))) / + as.numeric(as.vector(peakgroup_list$theormz_HMDB[i])) + } else { + ppmdev[i] <- NA + } + } + peakgroup_list <- cbind(peakgroup_list[, 1:6], ppmdev = ppmdev, peakgroup_list[, 7:ncol(peakgroup_list)]) + } + } + + return(peakgroup_list) +} diff --git a/DIMS/Utils/check_overlap.R b/DIMS/Utils/check_overlap.R new file mode 100644 index 0000000..66492e8 --- /dev/null +++ b/DIMS/Utils/check_overlap.R @@ -0,0 +1,24 @@ +## adapted from checkOverlap.R +check_overlap <- function(range1, range2) { + #' Modify range1 and range2 in case of overlap + #' + #' @param range1: Vector of m/z values for first peak (float) + #' @param range2: Vector of m/z values for second peak (float) + #' + #' @return new_ranges: list of two ranges (list) + + # Check for overlap + if (length(intersect(range1, range2)) == 2) { + if (length(range1) >= length(range2)) { + range1 <- range1[-length(range1)] + } else { + range2 <- range2[-1] + } + } else if (length(intersect(range1, range2)) == 3) { + range1 <- range1[-length(range1)] + range2 <- range2[-1] + } + new_ranges <- list("range1" = range1, "range2" = range2) + return(new_ranges) +} + diff --git a/DIMS/Utils/check_same_samplename.R b/DIMS/Utils/check_same_samplename.R new file mode 100644 index 0000000..6e80841 --- /dev/null +++ b/DIMS/Utils/check_same_samplename.R @@ -0,0 +1,10 @@ +check_same_samplename <- function(int_col_name, zscore_col_name) { + #' A check to see if intensity and Z-score columns match + #' + #' @param int_col_name: name of an intensity column (string) + #' @param zscore_col_name: name of a Z-score column (string) + #' + #' @return: match or mismatch of the columns (boolean) + + paste0(int_col_name, "_Zscore") == zscore_col_name +} diff --git a/DIMS/Utils/create_violin_plots.R b/DIMS/Utils/create_violin_plots.R new file mode 100644 index 0000000..be4f1f6 --- /dev/null +++ b/DIMS/Utils/create_violin_plots.R @@ -0,0 +1,122 @@ +# remove parameter default value +# add explanation variable to parameters +create_violin_plots <- function(pdf_dir, pt_name, metab_perpage, top_metab_pt = NULL) { + #' Create violin plots for each patient + #' + #' @param pdf_dir: location where to save the pdf file (string) + #' @param pt_name: patient code (string) + #' @param metab_perpage: list of dataframe with a dataframe for each page in the violinplot pdf (list) + #' @param top_metab_pt: dataframe with increased and decrease metabolites for this patient (dataframe) + + # set parameters for plots + plot_height <- 9.6 + plot_width <- 6 + fontsize <- 1 + circlesize <- 0.8 + colors_4plot <- c("#22E4AC", "#00B0F0", "#504FFF", "#A704FD", "#F36265", "#DA0641") + # green blue blue/purple purple orange red + + # patient plots, create the PDF device + pt_name_sub <- pt_name + suffix <- "" + if (grepl("Diagnostics", pdf_dir) & is_diagnostic_patient(pt_name)) { + prefix <- "MB" + suffix <- "_DIMS_PL_DIAG" + # substitute P and M in P2020M00001 into right format for Helix + pt_name_sub <- gsub("[PM]", "", pt_name) + pt_name_sub <- gsub("\\..*", "", pt_name_sub) + } else if (grepl("Diagnostics", pdf_dir)) { + prefix <- "Dx_" + } else if (grepl("IEM", pdf_dir)) { + prefix <- "IEM_" + } else { + prefix <- "R_" + } + + pdf(paste0(pdf_dir, "/", prefix, pt_name_sub, suffix, ".pdf"), + onefile = TRUE, + width = plot_width, + height = plot_height) + + # page headers: + page_headers <- names(metab_perpage) + + # put table into PDF file, if not empty + if (!is.null(dim(top_metab_pt))) { + plot.new() + # get the names and numbers in the table aligned + table_theme <- ttheme_default(core = list(fg_params = list(hjust = 0, x = 0.05, fontsize = 6)), + colhead = list(fg_params = list(fontsize = 8, fontface = "bold"))) + grid.table(top_metab_pt, theme = table_theme, rows = NULL) + # g <- tableGrob(top_metab_pt) + # grid.draw(g) + text(x = 0.45, y = 1.02, paste0("Top deviating metabolites for patient: ", pt_name), font = 1, cex = 1) + } + + # violin plots + for (page_index in 1:length(metab_perpage)) { + # extract list of metabolites to plot on a page + metab_list_2plot <- metab_perpage[[page_index]] + # extract original data for patient of interest (pt_name) before cut-offs + pt_list_2plot_orig <- metab_list_2plot[which(metab_list_2plot$variable == pt_name), ] + # cut off Z-scores higher than 20 or lower than -5 (for nicer plots) + metab_list_2plot$value[metab_list_2plot$value > 20] <- 20 + metab_list_2plot$value[metab_list_2plot$value < -5] <- -5 + # extract data for patient of interest (pt_name) + pt_list_2plot <- metab_list_2plot[which(metab_list_2plot$variable == pt_name), ] + # restore original Z-score before cut-off, for showing Z-scores in PDF + pt_list_2plot$value_orig <- pt_list_2plot_orig$value + # remove patient of interest (pt_name) from list; violins will be made up of controls and other patients + metab_list_2plot <- metab_list_2plot[-which(metab_list_2plot$variable == pt_name), ] + # subtitle per page + sub_perpage <- gsub("_", " ", page_headers[page_index]) + # for IEM plots, put subtitle on two lines + sub_perpage <- gsub("probability", "\nprobability", sub_perpage) + # add size parameter for showing Z-score of patient per metabolite + z_size <- rep(3, nrow(pt_list_2plot)) + # set size to 0 if row is empty + z_size[is.na(pt_list_2plot$value)] <- 0 + + # draw violin plot. shape=22 gives square for patient of interest + ggplot_object <- ggplot(metab_list_2plot, aes(x = value, y = HMDB_name)) + + theme(axis.text.y = element_text(size = rel(fontsize)), plot.caption = element_text(size = rel(fontsize))) + + xlim(-5, 20) + + geom_violin(scale = "width") + + geom_point(data = pt_list_2plot, aes(color = value), size = 3.5 * circlesize, shape = 22, fill = "white") + + scale_fill_gradientn( + colors = colors_4plot, values = NULL, space = "Lab", na.value = "grey50", guide = "colourbar", + aesthetics = "colour" + ) + + # add Z-score value for patient of interest at x=16 + geom_text( + data = pt_list_2plot, aes(16, label = paste0("Z=", round(value_orig, 2))), hjust = "left", vjust = +0.2, + size = z_size + ) + + # add labels. Use font Courier to get all the plots in the same location. + labs(x = "Z-scores", y = "Metabolites", subtitle = sub_perpage, color = "z-score") + + theme(axis.text.y = element_text(family = "Courier", size = 6)) + + # do not show legend + theme(legend.position = "none") + + # add title + ggtitle(label = paste0("Results for patient ", pt_name)) + + # add vertical lines + geom_vline(xintercept = 2, col = "grey", lwd = 0.5, lty = 2) + + geom_vline(xintercept = -2, col = "grey", lwd = 0.5, lty = 2) + + suppressWarnings(print(ggplot_object)) + + } + + # add explanation of violin plots, version number etc. + plot(NA, xlim = c(0, 5), ylim = c(0, 5), bty = "n", xaxt = "n", yaxt = "n", xlab = "", ylab = "") + if (length(explanation) > 0) { + text(0.2, 5, explanation[1], pos = 4, cex = 0.8) + for (line_index in 2:length(explanation)) { + text_y_position <- 5 - (line_index * 0.2) + text(-0.2, text_y_position, explanation[line_index], pos = 4, cex = 0.5) + } + } + + # close the PDF file + dev.off() +} diff --git a/DIMS/Utils/do_peakfinding.R b/DIMS/Utils/do_peakfinding.R new file mode 100644 index 0000000..fff6afe --- /dev/null +++ b/DIMS/Utils/do_peakfinding.R @@ -0,0 +1,55 @@ +## adapted from findpeaks.Gauss.HPC.R +# NB: this function will be taken up into PeakFinding.R +# variables with fixed values will be removed from function parameters +# int_factor, scale, outdir, plot, thresh, width, height +do_peakfinding <- function(sample_avgtechrepl, int_factor, scale, resol, outdir, scanmode, plot, thresh, width, height) { + #' start peak finding + #' + #' @param sample_avgtechrepl: Dataframe with binned intensities averaged over technical replicates for a sample + #' @param int_factor: Value used to calculate area under Gaussian curve (integer) + #' @param scale: Initial value used to estimate scaling parameter (integer) + #' @param resol: Value for resolution (integer) + #' @param outdir: Path for output directory (string) + #' @param scanmode: Scan mode, positive or negative (string) + #' @param plot: Parameter indicating whether plots should be made (boolean) + #' @param thresh: Value for noise level threshold (integer) + #' @param width: Value for width of plot (integer) + #' @param height: Value for height of plot (integer) + #' + #' @return save output to file + + sample_name <- colnames(sample_avgtechrepl)[1] + + # turn dataframe with intensities into a named list + ints_fullrange <- as.vector(sample_avgtechrepl) + names(ints_fullrange) <- rownames(sample_avgtechrepl) + + # initialise list to store results for all peaks + allpeaks_values <- list("mean" = NULL, "area" = NULL, "nr" = NULL, + "min" = NULL, "max" = NULL, "qual" = NULL, "spikes" = 0) + + # look for m/z range for all peaks + allpeaks_values <- search_mzrange(ints_fullrange, allpeaks_values, int_factor, scale, resol, + outdir, sample_name, scanmode, + plot, width, height, thresh) + + # turn the list into a dataframe + outlist_persample <- NULL + outlist_persample <- cbind("samplenr" = allpeaks_values$nr, + "mzmed.pkt" = allpeaks_values$mean, + "fq" = allpeaks_values$qual, + "mzmin.pkt" = allpeaks_values$min, + "mzmax.pkt" = allpeaks_values$max, + "height.pkt" = allpeaks_values$area) + + # remove peaks with height = 0 + outlist_persample <- outlist_persample[outlist_persample[, "height.pkt"] != 0, ] + + # save output to file + save(outlist_persample, file = paste0(sample_name, "_", scanmode, ".RData")) + + # generate text output to log file on number of spikes for this sample + # spikes are peaks that are too narrow, e.g. 1 data point + cat(paste("There were", allpeaks_values$spikes, "spikes")) +} + diff --git a/DIMS/Utils/estimate_area.R b/DIMS/Utils/estimate_area.R new file mode 100644 index 0000000..b191f55 --- /dev/null +++ b/DIMS/Utils/estimate_area.R @@ -0,0 +1,30 @@ +## adapted from getArea.R +# variables with fixed values will be removed from function parameters +# int_factor +estimate_area <- function(mass_max, resol, scale, sigma, int_factor) { + #' Estimate area of Gaussian curve + #' + #' @param mass_max: Value for m/z at maximum intensity of a peak (float) + #' @param resol: Value for resolution (integer) + #' @param scale: Value for peak width (float) + #' @param sigma: Value for standard deviation (float) + #' @param int_factor: Value used to calculate area under Gaussian curve (integer) + #' + #' @return area_curve: Value for area under the Gaussian curve (float) + + # avoid vectors that are too big (cannot allocate vector of size ...) + if (mass_max > 1200) return(0) + + # generate a mass_vector with equally spaced m/z values + fwhm <- get_fwhm(mass_max, resol) + mz_min <- mass_max - 2 * fwhm + mz_max <- mass_max + 2 * fwhm + mz_range <- mz_max - mz_min + mass_vector2 <- seq(mz_min, mz_max, length = mz_range * int_factor) + + # estimate area under the curve + area_curve <- sum(scale * dnorm(mass_vector2, mass_max, sigma)) / 100 + + return(area_curve) +} + diff --git a/DIMS/Utils/fit_gaussian.R b/DIMS/Utils/fit_gaussian.R new file mode 100644 index 0000000..6be9660 --- /dev/null +++ b/DIMS/Utils/fit_gaussian.R @@ -0,0 +1,319 @@ +## adapted from fitGaussian.R +# variables with fixed values will be removed from function parameters +# scale, outdir, plot, width, height +# max_index doesn't need to be passed to this function, can be determined here. +# remove plot sections (commented out) +# several functions need to be loaded before this function can run +fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale, resol, + outdir, force, use_bounds, plot, scanmode, + int_factor, width, height) { + #' Fit 1, 2, 3 or 4 Gaussian peaks in small region of m/z + #' + #' @param mass_vector2: Vector of equally spaced m/z values (float) + #' @param mass_vector: Vector of m/z values for a region of interest (float) + #' @param int_vector: Value used to calculate area under Gaussian curve (integer) + #' @param max_index: Index in int_vector with the highest intensity (integer) + #' @param scale: Initial value used to estimate scaling parameter (integer) + #' @param resol: Value for resolution (integer) + #' @param outdir: Path for output directory (string) + #' @param force: Number of local maxima in int_vector (integer) + #' @param use_bounds: Boolean to indicate whether boundaries are to be used + #' @param plot: Parameter indicating whether plots should be made (boolean) + #' @param scanmode: Scan mode, positive or negative (string) + #' @param int_factor: Value used to calculate area under Gaussian curve (integer) + #' @param width: Value for width of plot (integer) + #' @param height: Value for height of plot (integer) + #' + #' @return roi_value_list: list of fit values for region of interest (list) + + # Initialise + peak_mean <- NULL + peak_area <- NULL + peak_qual <- NULL + peak_min <- NULL + peak_max <- NULL + fit_quality1 <- 0.15 + fit_quality <- 0.2 + + # One local maximum: + if (force == 1) { + # determine fit values for 1 Gaussian peak (mean, scale, sigma, qual) + fit_values <- fit_1peak(mass_vector2, mass_vector, int_vector, max_index, scale, resol, + plot, fit_quality1, use_bounds) + # set initial value for scale factor + scale <- 2 + # test if the mean is outside the m/z range + if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)]) { + # run this function again with fixed boundaries + return(fit_gaussian(mass_vector2, mass_vector, int_vector, max_index, scale, resol, + outdir, force = 1, use_bounds = TRUE, plot, scanmode, int_factor, width, height)) + } else { + # test if the fit is bad + if (fit_values$qual > fit_quality1) { + # Try to fit two curves; find two local maxima + new_index <- which(diff(sign(diff(int_vector))) == -2) + 1 + # test if there are two indices in new_index + if (length(new_index) != 2) { + new_index <- round(length(mass_vector) / 3) + new_index <- c(new_index, 2 * new_index) + } + # run this function again with two local maxima + return(fit_gaussian(mass_vector2, mass_vector, int_vector, new_index, + scale, resol, outdir, force = 2, use_bounds = FALSE, + plot, scanmode, int_factor, width, height)) + # good fit + } else { + peak_mean <- c(peak_mean, fit_values$mean) + peak_area <- c(peak_area, estimate_area(fit_values$mean, resol, fit_values$scale, + fit_values$sigma, int_factor)) + peak_qual <- fit_values$qual + peak_min <- mass_vector[1] + peak_max <- mass_vector[length(mass_vector)] + } + } + + #### Two local maxima; need at least 6 data points for this #### + } else if (force == 2 && (length(mass_vector) > 6)) { + # determine fit values for 2 Gaussian peaks (mean, scale, sigma, qual) + fit_values <- fit_2peaks(mass_vector2, mass_vector, int_vector, max_index, scale, resol, + use_bounds, plot, fit_quality, int_factor) + # test if one of the means is outside the m/z range + if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)] || + fit_values$mean[2] < mass_vector[1] || fit_values$mean[2] > mass_vector[length(mass_vector)]) { + # check if fit quality is bad + if (fit_values$qual > fit_quality) { + # run this function again with fixed boundaries + return(fit_gaussian(mass_vector2, mass_vector, int_vector, max_index, scale, resol, + outdir, force = 2, use_bounds = TRUE, + plot, scanmode, int_factor, width, height)) + } else { + # check which mean is outside range and remove it from the list of means + # NB: peak_mean and other variables have not been given values from 2-peak fit yet! + for (i in 1:length(fit_values$mean)){ + if (fit_values$mean[i] < mass_vector[1] || fit_values$mean[i] > mass_vector[length(mass_vector)]) { + peak_mean <- c(peak_mean, -i) + peak_area <- c(peak_area, -i) + } else { + peak_mean <- c(peak_mean, fit_values$mean[i]) + peak_area <- c(peak_area, fit_values$area[i]) + } + } + peak_qual <- fit_values$qual + peak_min <- mass_vector[1] + peak_max <- mass_vector[length(mass_vector)] + } + # if all means are within range + } else { + # check for bad fit + if (fit_values$qual > fit_quality) { + # Try to fit three curves; find three local maxima + new_index <- which(diff(sign(diff(int_vector))) == -2) + 1 + # test if there are three indices in new_index + if (length(new_index) != 3) { + new_index <- round(length(mass_vector) / 4) + new_index <- c(new_index, 2 * new_index, 3 * new_index) + } + # run this function again with three local maxima + return(fit_gaussian(mass_vector2, mass_vector, int_vector, new_index, + scale, resol, outdir, force = 3, use_bounds = FALSE, + plot, scanmode, int_factor, width, height)) + # good fit, all means are within m/z range + } else { + # check if means are within 3 ppm and sum if so + tmp <- fit_values$qual + nr_means_new <- -1 + nr_means <- length(fit_values$mean) + while (nr_means != nr_means_new) { + nr_means <- length(fit_values$mean) + fit_values <- within_ppm(fit_values$mean, fit_values$scale, fit_values$sigma, fit_values$area, + mass_vector2, mass_vector, ppm = 4, resol, plot) + nr_means_new <- length(fit_values$mean) + } + # restore original quality score + fit_values$qual <- tmp + + for (i in 1:length(fit_values$mean)){ + peak_mean <- c(peak_mean, fit_values$mean[i]) + peak_area <- c(peak_area, fit_values$area[i]) + } + peak_qual <- fit_values$qual + peak_min <- mass_vector[1] + peak_max <- mass_vector[length(mass_vector)] + } + } + + # Three local maxima; need at least 6 data points for this + } else if (force == 3 && (length(mass_vector) > 6)) { + # determine fit values for 3 Gaussian peaks (mean, scale, sigma, qual) + fit_values <- fit_3peaks(mass_vector2, mass_vector, int_vector, max_index, scale, resol, + use_bounds, plot, fit_quality, int_factor) + # test if one of the means is outside the m/z range + if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)] || + fit_values$mean[2] < mass_vector[1] || fit_values$mean[2] > mass_vector[length(mass_vector)] || + fit_values$mean[3] < mass_vector[1] || fit_values$mean[3] > mass_vector[length(mass_vector)]) { + # check if fit quality is bad + if (fit_values$qual > fit_quality) { + # run this function again with fixed boundaries + return(fit_gaussian(mass_vector2, mass_vector, int_vector, max_index, scale, resol, + outdir, force, use_bounds = TRUE, + plot, scanmode, int_factor, width, height)) + } else { + # check which mean is outside range and remove it from the list of means + # NB: peak_mean and other variables have not been given values from 2-peak fit yet! + for (i in 1:length(fit_values$mean)) { + if (fit_values$mean[i] < mass_vector[1] || fit_values$mean[i] > mass_vector[length(mass_vector)]) { + peak_mean <- c(peak_mean, -i) + peak_area <- c(peak_area, -i) + } else { + peak_mean <- c(peak_mean, fit_values$mean[i]) + peak_area <- c(peak_area, fit_values$area[i]) + } + } + peak_qual <- fit_values$qual + peak_min <- mass_vector[1] + peak_max <- mass_vector[length(mass_vector)] + } + # if all means are within range + } else { + # check for bad fit + if (fit_values$qual > fit_quality) { + # Try to fit four curves; find four local maxima + new_index <- which(diff(sign(diff(int_vector))) == -2) + 1 + # test if there are four indices in new_index + if (length(new_index) != 4) { + new_index <- round(length(mass_vector) / 5) + new_index <- c(new_index, 2 * new_index, 3 * new_index, 4 * new_index) + } + # run this function again with four local maxima + return(fit_gaussian(mass_vector2, mass_vector, int_vector, new_index, scale, resol, + outdir, force = 4, use_bounds = FALSE, plot, scanmode, + int_factor, width, height)) + # good fit, all means are within m/z range + } else { + # check if means are within 4 ppm and sum if so + tmp <- fit_values$qual + nr_means_new <- -1 + nr_means <- length(fit_values$mean) + while (nr_means != nr_means_new) { + nr_means <- length(fit_values$mean) + fit_values <- within_ppm(fit_values$mean, fit_values$scale, fit_values$sigma, fit_values$area, + mass_vector2, mass_vector, ppm = 4, resol, plot) + nr_means_new <- length(fit_values$mean) + } + # restore original quality score + fit_values$qual <- tmp + + for (i in 1:length(fit_values$mean)){ + peak_mean <- c(peak_mean, fit_values$mean[i]) + peak_area <- c(peak_area, fit_values$area[i]) + } + peak_qual <- fit_values$qual + peak_min <- mass_vector[1] + peak_max <- mass_vector[length(mass_vector)] + + } + } + + #### Four local maxima; need at least 6 data points for this #### + } else if (force == 4 && (length(mass_vector) > 6)) { + # determine fit values for 4 Gaussian peaks (mean, scale, sigma, qual) + fit_values <- fit_4peaks(mass_vector2, mass_vector, int_vector, max_index, scale, resol, + use_bounds, plot, fit_quality, int_factor) + # test if one of the means is outside the m/z range + if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)] || + fit_values$mean[2] < mass_vector[1] || fit_values$mean[2] > mass_vector[length(mass_vector)] || + fit_values$mean[3] < mass_vector[1] || fit_values$mean[3] > mass_vector[length(mass_vector)] || + fit_values$mean[4] < mass_vector[1] || fit_values$mean[4] > mass_vector[length(mass_vector)]) { + # check if quality of fit is bad + if (fit_values$qual > fit_quality) { + # run this function again with fixed boundaries + return(fit_gaussian(mass_vector2, mass_vector, int_vector, max_index, scale, resol, + outdir, force, use_bounds = TRUE, + plot, scanmode, int_factor, width, height)) + + } else { + # check which mean is outside range and remove it from the list of means + # NB: peak_mean and other variables have not been given values from 2-peak fit yet! + for (i in 1:length(fit_values$mean)) { + if (fit_values$mean[i] < mass_vector[1] | fit_values$mean[i] > mass_vector[length(mass_vector)]) { + peak_mean <- c(peak_mean, -i) + peak_area <- c(peak_area, -i) + } else { + peak_mean <- c(peak_mean, fit_values$mean[i]) + peak_area <- c(peak_area, fit_values$area[i]) + } + } + peak_qual <- fit_values$qual + peak_min <- mass_vector[1] + peak_max <- mass_vector[length(mass_vector)] + } + # if all means are within range + } else { + # check for bad fit + if (fit_values$qual > fit_quality) { + # Try to fit 1 curve, force = 5 + return(fit_gaussian(mass_vector2, mass_vector, int_vector, max_index, scale, resol, + outdir, force = 5, use_bounds = FALSE, + plot, scanmode, int_factor, width, height)) + # good fit, all means are within m/z range + } else { + # check if means are within 4 ppm and sum if so + tmp <- fit_values$qual + nr_means_new <- -1 + nr_means <- length(fit_values$mean) + while (nr_means != nr_means_new) { + nr_means <- length(fit_values$mean) + fit_values <- within_ppm(fit_values$mean, fit_values$scale, fit_values$sigma, fit_values$area, + mass_vector2, mass_vector, ppm = 4, resol, plot) + nr_means_new <- length(fit_values$mean) + } + # restore original quality score + fit_values$qual <- tmp + + for (i in 1:length(fit_values$mean)){ + peak_mean <- c(peak_mean, fit_values$mean[i]) + peak_area <- c(peak_area, fit_values$area[i]) + } + peak_qual <- fit_values$qual + peak_min <- mass_vector[1] + peak_max <- mass_vector[length(mass_vector)] + + } + } + + # More than four local maxima; fit 1 peak. + } else { + scale <- 2 + fit_quality1 <- 0.40 + use_bounds <- TRUE + max_index <- which(int_vector == max(int_vector)) + fit_values <- fit_1peak(mass_vector2, mass_vector, int_vector, max_index, scale, resol, + plot, fit_quality1, use_bounds) + # check for bad fit + if (fit_values$qual > fit_quality1) { + # remove + if (plot) dev.off() + # get fit values from fit_optim + fit_values <- fit_optim(mass_vector, int_vector, resol, plot, scanmode, int_factor, width, height) + peak_mean <- c(peak_mean, fit_values$mean) + peak_area <- c(peak_area, fit_values$area) + peak_min <- fit_values$min + peak_max <- fit_values$max + peak_qual <- 0 + } else { + peak_mean <- c(peak_mean, fit_values$mean) + peak_area <- c(peak_area, estimate_area(fit_values$mean, resol, fit_values$scale, fit_values$sigma, int_factor)) + peak_qual <- fit_values$qual + peak_min <- mass_vector[1] + peak_max <- mass_vector[length(mass_vector)] + } + } + + # put all values for this region of interest into a list + roi_value_list <- list("mean" = peak_mean, + "area" = peak_area, + "qual" = peak_qual, + "min" = peak_min, + "max" = peak_max) + return(roi_value_list) +} diff --git a/DIMS/Utils/fit_gaussians.R b/DIMS/Utils/fit_gaussians.R new file mode 100644 index 0000000..05bff86 --- /dev/null +++ b/DIMS/Utils/fit_gaussians.R @@ -0,0 +1,196 @@ +# Gaussian fit functions +## adapted from fitG.R, fit2G.R, fit3G.R and fit4G.R (combined) +fit_1gaussian <- function(mass_vector, int_vector, sigma, query_mass, scale, use_bounds) { + #' Fit a Gaussian curve for a peak with given parameters + #' + #' @param mass_vector: Vector of masses (float) + #' @param int_vector: Vector of intensities (float) + #' @param sigma: Value for width of the peak (float) + #' @param query_mass: Value for mass at center of peak (float) + #' @param scale: Value for scaling intensities (float) + #' @param use_bounds: Boolean to indicate whether boundaries are to be used + #' + #' @return opt_fit: list of parameters and values describing the optimal fit + + # define optimization function for optim based on normal distribution + opt_f <- function(params) { + d <- params[2] * dnorm(mass_vector, mean = params[1], sd = sigma) + sum((d - int_vector) ^ 2) + } + if (use_bounds) { + # determine lower and upper boundaries + lower <- c(mass_vector[1], 0, mass_vector[1], 0) + upper <- c(mass_vector[length(mass_vector)], Inf, mass_vector[length(mass_vector)], Inf) + # get optimal value for fitted Gaussian curve + opt_fit <- optim(c(as.numeric(query_mass), as.numeric(scale)), + opt_f, control = list(maxit = 10000), method = "L-BFGS-B", + lower = lower, upper = upper) + } else { + opt_fit <- optim(c(as.numeric(query_mass), as.numeric(scale)), + opt_f, control = list(maxit = 10000)) + } + return(opt_fit) +} + + +fit_2gaussians <- function(mass_vector, int_vector, sigma1, sigma2, + query_mass1, scale1, + query_mass2, scale2, use_bounds) { + #' Fit two Gaussian curves for a peak with given parameters + #' + #' @param mass_vector: Vector of masses (float) + #' @param int_vector: Vector of intensities (float) + #' @param sigma1: Value for width of the first peak (float) + #' @param sigma2: Value for width of the second peak (float) + #' @param query_mass1: Value for mass at center of first peak (float) + #' @param scale1: Value for scaling intensities for first peak (float) + #' @param query_mass2: Value for mass at center of second peak (float) + #' @param scale2: Value for scaling intensities for second peak (float) + #' @param use_bounds: Boolean to indicate whether boundaries are to be used + #' + #' @return opt_fit: list of parameters and values describing the optimal fit + + # define optimization function for optim based on normal distribution + opt_f <- function(params) { + d <- params[2] * dnorm(mass_vector, mean = params[1], sd = sigma1) + + params[4] * dnorm(mass_vector, mean = params[3], sd = sigma2) + sum((d - int_vector) ^ 2) + } + + if (use_bounds) { + # determine lower and upper boundaries + lower <- c(mass_vector[1], 0, mass_vector[1], 0) + upper <- c(mass_vector[length(mass_vector)], Inf, mass_vector[length(mass_vector)], Inf) + # get optimal value for 2 fitted Gaussian curves + if (is.null(query_mass2) && is.null(scale2) && is.null(sigma2)) { + sigma2 <- sigma1 + opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1), + as.numeric(query_mass1), as.numeric(scale1)), + opt_f, control = list(maxit = 10000), + method = "L-BFGS-B", lower = lower, upper = upper) + } else { + opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1), + as.numeric(query_mass2), as.numeric(scale2)), + opt_f, control = list(maxit = 10000), + method = "L-BFGS-B", lower = lower, upper = upper) + } + } else { + if (is.null(query_mass2) && is.null(scale2) && is.null(sigma2)) { + sigma2 <- sigma1 + opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1), + as.numeric(query_mass1), as.numeric(scale1)), + opt_f, control = list(maxit = 10000)) + } else { + opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1), + as.numeric(query_mass2), as.numeric(scale2)), + opt_f, control = list(maxit = 10000)) + } + } + return(opt_fit) +} + + +fit_3gaussians <- function(mass_vector, int_vector, sigma1, sigma2, sigma3, + query_mass1, scale1, + query_mass2, scale2, + query_mass3, scale3, use_bounds) { + #' Fit three Gaussian curves for a peak with given parameters + #' + #' @param mass_vector: Vector of masses (float) + #' @param int_vector: Vector of intensities (float) + #' @param sigma1: Value for width of the first peak (float) + #' @param sigma2: Value for width of the second peak (float) + #' @param sigma3: Value for width of the third peak (float) + #' @param query_mass1: Value for mass at center of first peak (float) + #' @param scale1: Value for scaling intensities for first peak (float) + #' @param query_mass2: Value for mass at center of second peak (float) + #' @param scale2: Value for scaling intensities for second peak (float) + #' @param query_mass3: Value for mass at center of third peak (float) + #' @param scale3: Value for scaling intensities for third peak (float) + #' @param use_bounds: Boolean to indicate whether boundaries are to be used + #' + #' @return opt_fit: list of parameters and values describing the optimal fit + + # define optimization function for optim based on normal distribution + opt_f <- function(params) { + d <- params[2] * dnorm(mass_vector, mean = params[1], sd = sigma1) + + params[4] * dnorm(mass_vector, mean = params[3], sd = sigma2) + + params[6] * dnorm(mass_vector, mean = params[5], sd = sigma3) + sum((d - int_vector) ^ 2) + } + + if (use_bounds) { + # determine lower and upper boundaries + lower <- c(mass_vector[1], 0, mass_vector[1], 0, mass_vector[1], 0) + upper <- c(mass_vector[length(mass_vector)], Inf, mass_vector[length(mass_vector)], Inf, + mass_vector[length(mass_vector)], Inf) + # get optimal value for 3 fitted Gaussian curves + opt_fit <- optim(c(query_mass1, scale1, + query_mass2, scale2, + query_mass3, scale3), + opt_f, control = list(maxit = 10000), + method = "L-BFGS-B", lower = lower, upper = upper) + } else { + opt_fit <- optim(c(query_mass1, scale1, + query_mass2, scale2, + query_mass3, scale3), + opt_f, control = list(maxit = 10000)) + } + return(opt_fit) +} + +fit_4gaussians <- function(mass_vector, int_vector, sigma1, sigma2, sigma3, sigma4, + query_mass1, scale1, + query_mass2, scale2, + query_mass3, scale3, + query_mass4, scale4, use_bounds) { + #' Fit four Gaussian curves for a peak with given parameters + #' + #' @param mass_vector: Vector of masses (float) + #' @param int_vector: Vector of intensities (float) + #' @param sigma1: Value for width of the first peak (float) + #' @param sigma2: Value for width of the second peak (float) + #' @param sigma3: Value for width of the third peak (float) + #' @param sigma4: Value for width of the fourth peak (float) + #' @param query_mass1: Value for mass at center of first peak (float) + #' @param scale1: Value for scaling intensities for first peak (float) + #' @param query_mass2: Value for mass at center of second peak (float) + #' @param scale2: Value for scaling intensities for second peak (float) + #' @param query_mass3: Value for mass at center of third peak (float) + #' @param scale3: Value for scaling intensities for third peak (float) + #' @param query_mass4: Value for mass at center of fourth peak (float) + #' @param scale4: Value for scaling intensities for fourth peak (float) + #' @param use_bounds: Boolean to indicate whether boundaries are to be used + #' + #' @return opt_fit: list of parameters and values describing the optimal fit + + # define optimization function for optim based on normal distribution + opt_f <- function(params) { + d <- params[2] * dnorm(mass_vector, mean = params[1], sd = sigma1) + + params[4] * dnorm(mass_vector, mean = params[3], sd = sigma2) + + params[6] * dnorm(mass_vector, mean = params[5], sd = sigma3) + + params[8] * dnorm(mass_vector, mean = params[7], sd = sigma4) + sum((d - int_vector) ^ 2) + } + + if (use_bounds) { + # determine lower and upper boundaries + lower <- c(mass_vector[1], 0, mass_vector[1], 0, mass_vector[1], 0, mass_vector[1], 0) + upper <- c(mass_vector[length(mass_vector)], Inf, mass_vector[length(mass_vector)], Inf, + mass_vector[length(mass_vector)], Inf, mass_vector[length(mass_vector)], Inf) + # get optimal value for 3 fitted Gaussian curves + opt_fit <- optim(c(query_mass1, scale1, + query_mass2, scale2, + query_mass3, scale3, + query_mass4, scale4), + opt_f, control = list(maxit = 10000), + method = "L-BFGS-B", lower = lower, upper = upper) + } else { + opt_fit <- optim(c(query_mass1, scale1, + query_mass2, scale2, + query_mass3, scale3, + query_mass4, scale4), + opt_f, control = list(maxit = 10000)) + } + return(opt_fit) +} diff --git a/DIMS/Utils/fit_init.R b/DIMS/Utils/fit_init.R new file mode 100644 index 0000000..ecae337 --- /dev/null +++ b/DIMS/Utils/fit_init.R @@ -0,0 +1,47 @@ +## adapted from fitGaussianInit.R +# variables with fixed values will be removed from function parameters +# scale, outdir, plot, width, height +# mz_index, start_index, end_index, sample_name not used. +# fit_gaussian should be defined before this function is called. +fit_init <- function(mass_vector, int_vector, int_factor, scale, resol, + outdir, sample_name, scanmode, plot, width, height, + mz_index, start_index, end_index) { + #' Determine initial fit of Gaussian curve to small region of m/z + #' + #' @param mass_vector: Vector of m/z values for a region of interest (float) + #' @param int_vector: Vector of intensities for a region of interest (float) + #' @param int_factor: Value used to calculate area under Gaussian curve (integer) + #' @param scale: Initial value used to estimate scaling parameter (integer) + #' @param resol: Value for resolution (integer) + #' @param outdir: Path for output directory (string) + #' @param sample_name: Sample name (string) + #' @param scanmode: Scan mode, positive or negative (string) + #' @param plot: Parameter indicating whether plots should be made (boolean) + #' @param width: Value for width of plot (integer) + #' @param height: Value for height of plot (integer) + #' @param mz_index: Index of m/z value with non-zero intensity (integer) + #' @param start_index: Index of start of m/z range in mass_vector (integer) + #' @param end_index: Index of end of m/z range in mass_vector (integer) + #' + #' @return roi_value_list: list of fit values for region of interest (list) + + # define mass_diff as difference between last and first value of mass_vector + mass_diff <- mass_vector[length(mass_vector)] - mass_vector[1] + # generate a second mass_vector with equally spaced m/z values + mass_vector2 <- seq(mass_vector[1], mass_vector[length(mass_vector)], + length = mass_diff * int_factor) + + # Find the index in int_vector with the highest intensity + max_index <- which(int_vector == max(int_vector)) + roi_values <- fit_gaussian(mass_vector2, mass_vector, int_vector, max_index, + scale, resol, outdir, force = length(max_index), + use_bounds = FALSE, plot, scanmode, int_factor, width, height) + # put all values for this region of interest into a list + roi_value_list <- list("mean" = roi_values$mean, + "area" = roi_values$area, + "qual" = roi_values$qual, + "min" = roi_values$min, + "max" = roi_values$max) + return(roi_value_list) +} + diff --git a/DIMS/Utils/fit_optim.R b/DIMS/Utils/fit_optim.R new file mode 100644 index 0000000..404cc7a --- /dev/null +++ b/DIMS/Utils/fit_optim.R @@ -0,0 +1,47 @@ +## adapted from generateGaussian.R +# variables with fixed values will be removed from function parameters +# plot, width, height +# fit_gaussian should be defined before this function is called. +fit_optim <- function(mass_vector, int_vector, resol, + plot, scanmode, int_factor, width, height) { + #' Determine optimized fit of Gaussian curve to small region of m/z + #' + #' @param mass_vector: Vector of m/z values for a region of interest (float) + #' @param int_vector: Vector of intensities for a region of interest (float) + #' @param resol: Value for resolution (integer) + #' @param plot: Parameter indicating whether plots should be made (boolean) + #' @param scanmode: Scan mode, positive or negative (string) + #' @param int_factor: Value used to calculate area under Gaussian curve (integer) + #' @param width: Value for width of plot (integer) + #' @param height: Value for height of plot (integer) + #' + #' @return roi_value_list: list of fit values for region of interest (list) + + factor <- 1.5 + # Find the index in int_vector with the highest intensity + max_index <- which(int_vector == max(int_vector))[1] + mass_max <- mass_vector[max_index] + int_max <- int_vector[max_index] + # get peak width + fwhm <- get_fwhm(mass_max, resol) + # simplify the peak shape: represent it by a triangle + mass_max_simple <- c(mass_max - factor * fwhm, mass_max, mass_max + factor * fwhm) + int_max_simple <- c(0, int_max, 0) + + # define mass_diff as difference between last and first value of mass_max_simple + mass_diff <- mass_max_simple[length(mass_max_simple)] - mass_max_simple[1] + # generate a second mass_vector with equally spaced m/z values + mass_vector2 <- seq(mass_max_simple[1], mass_max_simple[length(mass_max_simple)], + length = mass_diff * int_factor) + sigma <- get_stdev(mass_vector2, int_max_simple) + scale <- optimize_gaussfit(mass_vector2, int_max_simple, sigma, mass_max) + + # get an estimate of the area under the peak + area <- estimate_area(mass_max, resol, scale, sigma, int_factor) + # put all values for this region of interest into a list + roi_value_list <- list("mean" = mass_max, + "area" = area, + "min" = mass_vector2[1], + "max" = mass_vector2[length(mass_vector2)]) + return(roi_value_list) +} diff --git a/DIMS/Utils/fit_peaks.R b/DIMS/Utils/fit_peaks.R new file mode 100644 index 0000000..02d5408 --- /dev/null +++ b/DIMS/Utils/fit_peaks.R @@ -0,0 +1,381 @@ +## adapted from fit1Peak.R, fit2peaks.R, fit3peaks.R and fit4peaks.R (combined) +# variables with fixed values will be removed from function parameters +# plot, int_factor +fit_1peak <- function(mass_vector2, mass_vector, int_vector, max_index, scale, resol, plot, fit_quality, use_bounds) { + #' Fit 1 Gaussian peak in small region of m/z + #' + #' @param mass_vector2: Vector of equally spaced m/z values (float) + #' @param mass_vector: Vector of m/z values for a region of interest (float) + #' @param int_vector: Value used to calculate area under Gaussian curve (integer) + #' @param max_index: Index in int_vector with the highest intensity (integer) + #' @param scale: Initial value used to estimate scaling parameter (integer) + #' @param resol: Value for resolution (integer) + #' @param plot: Parameter indicating whether plots should be made (boolean) + #' @param fit_quality: Value indicating quality of fit of Gaussian curve (float) + #' @param use_bounds: Boolean to indicate whether boundaries are to be used + #' + #' @return roi_value_list: list of fit values for region of interest (list) + + if (length(int_vector) < 3) { + message("Range too small, no fit possible!") + } else { + if ((length(int_vector) == 4)) { + # fit 1 peak + mu <- weighted.mean(mass_vector, int_vector) + sigma <- get_stdev(mass_vector, int_vector) + fitted_peak <- fit_1gaussian(mass_vector, int_vector, sigma, mu, scale, use_bounds) + } else { + # set range vector + if ((length(mass_vector) - length(max_index)) < 2) { + range1 <- c((length(mass_vector) - 4) : length(mass_vector)) + } else if (length(max_index) < 2) { + range1 <- c(1:5) + } else { + range1 <- c(max_index[1] - 2, max_index[1] - 1, max_index[1], max_index[1] + 1, max_index[1] + 2) + } + if (range1[1] == 0) range1 <- range1[-1] + # remove NA + if (length(which(is.na(int_vector[range1]))) != 0) { + range1 <- range1[-which(is.na(int_vector[range1]))] + } + # fit 1 peak + mu <- weighted.mean(mass_vector[range1], int_vector[range1]) + sigma <- get_stdev(mass_vector[range1], int_vector[range1]) + fitted_peak <- fit_1gaussian(mass_vector, int_vector, sigma, mu, scale, use_bounds) + } + + p1 <- fitted_peak$par + + # get new value for fit quality and scale + fq_new <- get_fit_quality(mass_vector, int_vector, p1[1], p1[1], resol, p1[2], sigma)$fq_new + scale_new <- 1.2 * scale + + # bad fit + if (fq_new > fit_quality) { + # optimize scaling factor + fq <- 0 + scale <- 0 + if (sum(int_vector) > sum(p1[2] * dnorm(mass_vector, p1[1], sigma))) { + while ((round(fq, digits = 3) != round(fq_new, digits = 3)) && (scale_new < 10000)) { + fq <- fq_new + scale <- scale_new + # fit 1 peak + fitted_peak <- fit_1gaussian(mass_vector, int_vector, sigma, mu, scale, use_bounds) + p1 <- fitted_peak$par + # get new value for fit quality and scale + fq_new <- get_fit_quality(mass_vector, int_vector, p1[1], p1[1], resol, p1[2], sigma)$fq_new + scale_new <- 1.2 * scale + } + } else { + while ((round(fq, digits = 3) != round(fq_new, digits = 3)) && (scale_new < 10000)) { + fq <- fq_new + scale <- scale_new + # fit 1 peak + fitted_peak <- fit_1gaussian(mass_vector, int_vector, sigma, mu, scale, use_bounds) + p1 <- fitted_peak$par + # get new value for fit quality and scale + fq_new <- get_fit_quality(mass_vector, int_vector, p1[1], p1[1], resol, p1[2], sigma)$fq_new + scale_new <- 0.8 * scale + } + } + # use optimized scale factor to fit 1 peak + if (fq < fq_new) { + fitted_peak <- fit_1gaussian(mass_vector, int_vector, sigma, mu, scale, use_bounds) + p1 <- fitted_peak$par + fq_new <- fq + } + } + } + + roi_value_list <- list("mean" = p1[1], "scale" = p1[2], "sigma" = sigma, "qual" = fq_new) + return(roi_value_list) +} + +fit_2peaks <- function(mass_vector2, mass_vector, int_vector, max_index, scale, resol, use_bounds = FALSE, + plot = FALSE, fit_quality, int_factor) { + #' Fit 2 Gaussian peaks in small region of m/z + #' + #' @param mass_vector2: Vector of equally spaced m/z values (float) + #' @param mass_vector: Vector of m/z values for a region of interest (float) + #' @param int_vector: Value used to calculate area under Gaussian curve (integer) + #' @param max_index: Index in int_vector with the highest intensity (integer) + #' @param scale: Initial value used to estimate scaling parameter (integer) + #' @param resol: Value for resolution (integer) + #' @param plot: Parameter indicating whether plots should be made (boolean) + #' @param fit_quality: Value indicating quality of fit of Gaussian curve (float) + #' @param use_bounds: Boolean to indicate whether boundaries are to be used + #' @param int_factor: Value used to calculate area under Gaussian curve (integer) + #' + #' @return roi_value_list: list of fit values for region of interest (list) + + peak_mean <- NULL + peak_area <- NULL + peak_scale <- NULL + peak_sigma <- NULL + + # set range vectors for 2 peaks + range1 <- c(max_index[1] - 2, max_index[1] - 1, max_index[1], max_index[1] + 1, max_index[1] + 2) + if (range1[1] == 0) range1 <- range1[-1] + range2 <- c(max_index[2] - 2, max_index[2] - 1, max_index[2], max_index[2] + 1, max_index[2] + 2) + if (length(mass_vector) < range2[length(range2)]) range2 <- range2[-length(range2)] + range1 <- check_overlap(range1, range2)[[1]] + range2 <- check_overlap(range1, range2)[[2]] + # check for negative or 0 + remove <- which(range1 < 1) + if (length(remove) > 0) range1 <- range1[-remove] + remove <- which(range2 < 1) + if (length(remove) > 0) range2 <- range2[-remove] + # remove NA + if (length(which(is.na(int_vector[range1]))) != 0) range1 <- range1[-which(is.na(int_vector[range1]))] + if (length(which(is.na(int_vector[range2]))) != 0) range2 <- range2[-which(is.na(int_vector[range2]))] + + # fit 2 peaks, first separately, then together + mu1 <- weighted.mean(mass_vector[range1], int_vector[range1]) + sigma1 <- get_stdev(mass_vector[range1], int_vector[range1]) + fitted_peak <- fit_1gaussian(mass_vector[range1], int_vector[range1], sigma1, mu1, scale, use_bounds) + p1 <- fitted_peak$par + # second peak + mu2 <- weighted.mean(mass_vector[range2], int_vector[range2]) + sigma2 <- get_stdev(mass_vector[range2], int_vector[range2]) + fitted_peak <- fit_1gaussian(mass_vector[range2], int_vector[range2], sigma2, mu2, scale, use_bounds) + p2 <- fitted_peak$par + # combined + fitted_2peaks <- fit_2gaussians(mass_vector, int_vector, sigma1, sigma2, p1[1], p1[2], p2[1], p2[2], use_bounds) + pc <- fitted_2peaks$par + + # get fit quality + if (is.null(sigma2)) sigma2 <- sigma1 + sum_fit <- (pc[2] * dnorm(mass_vector, pc[1], sigma1)) + + (pc[4] * dnorm(mass_vector, pc[3], sigma2)) + fq <- get_fit_quality(mass_vector, int_vector, sort(c(pc[1], pc[3]))[1], sort(c(pc[1], pc[3]))[2], + resol, sum_fit = sum_fit)$fq_new + + # get parameter values + area1 <- estimate_area(pc[1], resol, pc[2], sigma1, int_factor) + area2 <- estimate_area(pc[3], resol, pc[4], sigma2, int_factor) + peak_area <- c(peak_area, area1) + peak_area <- c(peak_area, area2) + peak_mean <- c(peak_mean, pc[1]) + peak_mean <- c(peak_mean, pc[3]) + peak_scale <- c(peak_scale, pc[2]) + peak_scale <- c(peak_scale, pc[4]) + peak_sigma <- c(peak_sigma, sigma1) + peak_sigma <- c(peak_sigma, sigma2) + + roi_value_list <- list("mean" = peak_mean, "scale" = peak_scale, "sigma" = peak_sigma, "area" = peak_area, "qual" = fq) + return(roi_value_list) +} + +fit_3peaks <- function(mass_vector2, mass_vector, int_vector, max_index, scale, resol, use_bounds = FALSE, + plot = FALSE, fit_quality, int_factor) { + #' Fit 3 Gaussian peaks in small region of m/z + #' + #' @param mass_vector2: Vector of equally spaced m/z values (float) + #' @param mass_vector: Vector of m/z values for a region of interest (float) + #' @param int_vector: Value used to calculate area under Gaussian curve (integer) + #' @param max_index: Index in int_vector with the highest intensity (integer) + #' @param scale: Initial value used to estimate scaling parameter (integer) + #' @param resol: Value for resolution (integer) + #' @param plot: Parameter indicating whether plots should be made (boolean) + #' @param fit_quality: Value indicating quality of fit of Gaussian curve (float) + #' @param use_bounds: Boolean to indicate whether boundaries are to be used + #' @param int_factor: Value used to calculate area under Gaussian curve (integer) + #' + #' @return roi_value_list: list of fit values for region of interest (list) + + peak_mean <- NULL + peak_area <- NULL + peak_scale <- NULL + peak_sigma <- NULL + + # set range vectors for 3 peaks + range1 <- c(max_index[1] - 2, max_index[1] - 1, max_index[1], max_index[1] + 1, max_index[1] + 2) + range2 <- c(max_index[2] - 2, max_index[2] - 1, max_index[2], max_index[2] + 1, max_index[2] + 2) + range3 <- c(max_index[3] - 2, max_index[3] - 1, max_index[3], max_index[3] + 1, max_index[3] + 2) + remove <- which(range1 < 1) + if (length(remove) > 0) { + range1 <- range1[-remove] + } + remove <- which(range2 < 1) + if (length(remove) > 0) { + range2 <- range2[-remove] + } + if (length(mass_vector) < range3[length(range3)]) range3 <- range3[-length(range3)] + range1 <- check_overlap(range1, range2)[[1]] + range2 <- check_overlap(range1, range2)[[2]] + range2 <- check_overlap(range2, range3)[[1]] + range3 <- check_overlap(range2, range3)[[2]] + # check for negative or 0 + remove <- which(range1 < 1) + if (length(remove) > 0) range1 <- range1[-remove] + remove <- which(range2 < 1) + if (length(remove) > 0) range2 <- range2[-remove] + remove <- which(range3 < 1) + if (length(remove) > 0) range3 <- range3[-remove] + # remove NA + if (length(which(is.na(int_vector[range1]))) != 0) range1 <- range1[-which(is.na(int_vector[range1]))] + if (length(which(is.na(int_vector[range2]))) != 0) range2 <- range2[-which(is.na(int_vector[range2]))] + if (length(which(is.na(int_vector[range3]))) != 0) range3 <- range3[-which(is.na(int_vector[range3]))] + + # fit 3 peaks, first separately, then together + mu1 <- weighted.mean(mass_vector[range1], int_vector[range1]) + sigma1 <- get_stdev(mass_vector[range1], int_vector[range1]) + fitted_peak <- fit_1gaussian(mass_vector[range1], int_vector[range1], sigma1, mu1, scale, use_bounds) + p1 <- fitted_peak$par + # second peak + mu2 <- weighted.mean(mass_vector[range2], int_vector[range2]) + sigma2 <- get_stdev(mass_vector[range2], int_vector[range2]) + fitted_peak <- fit_1gaussian(mass_vector[range2], int_vector[range2], sigma2, mu2, scale, use_bounds) + p2 <- fitted_peak$par + # third peak + mu3 <- weighted.mean(mass_vector[range3], int_vector[range3]) + sigma3 <- get_stdev(mass_vector[range3], int_vector[range3]) + fitted_peak <- fit_1gaussian(mass_vector[range3], int_vector[range3], sigma3, mu3, scale, use_bounds) + p3 <- fitted_peak$par + # combined + fitted_3peaks <- fit_3gaussians(mass_vector, int_vector, sigma1, sigma2, sigma3, + p1[1], p1[2], p2[1], p2[2], p3[1], p3[2], use_bounds) + pc <- fitted_3peaks$par + + # get fit quality + sum_fit = (pc[2] * dnorm(mass_vector, pc[1], sigma1)) + + (pc[4] * dnorm(mass_vector, pc[3], sigma2)) + + (pc[6] * dnorm(mass_vector, pc[5], sigma3)) + fq <- get_fit_quality(mass_vector, int_vector, sort(c(pc[1], pc[3], pc[5]))[1], sort(c(pc[1], pc[3], pc[5]))[3], + resol, sum_fit = sum_fit)$fq_new + + # get parameter values + area1 <- estimate_area(pc[1], resol, pc[2], sigma1, int_factor) + area2 <- estimate_area(pc[3], resol, pc[4], sigma2, int_factor) + area3 <- estimate_area(pc[5], resol, pc[6], sigma3, int_factor) + peak_area <- c(peak_area, area1) + peak_area <- c(peak_area, area2) + peak_area <- c(peak_area, area3) + peak_mean <- c(peak_mean, pc[1]) + peak_mean <- c(peak_mean, pc[3]) + peak_mean <- c(peak_mean, pc[5]) + peak_scale <- c(peak_scale, pc[2]) + peak_scale <- c(peak_scale, pc[4]) + peak_scale <- c(peak_scale, pc[6]) + peak_sigma <- c(peak_sigma, sigma1) + peak_sigma <- c(peak_sigma, sigma2) + peak_sigma <- c(peak_sigma, sigma3) + + roi_value_list <- list("mean" = peak_mean, "scale" = peak_scale, "sigma" = peak_sigma, "area" = peak_area, "qual" = fq) + return(roi_value_list) +} + +fit_4peaks <- function(mass_vector2, mass_vector, int_vector, max_index, scale, resol, use_bounds = FALSE, + plot = FALSE, fit_quality, int_factor) { + #' Fit 4 Gaussian peaks in small region of m/z + #' + #' @param mass_vector2: Vector of equally spaced m/z values (float) + #' @param mass_vector: Vector of m/z values for a region of interest (float) + #' @param int_vector: Value used to calculate area under Gaussian curve (integer) + #' @param max_index: Index in int_vector with the highest intensity (integer) + #' @param scale: Initial value used to estimate scaling parameter (integer) + #' @param resol: Value for resolution (integer) + #' @param plot: Parameter indicating whether plots should be made (boolean) + #' @param fit_quality: Value indicating quality of fit of Gaussian curve (float) + #' @param use_bounds: Boolean to indicate whether boundaries are to be used + #' @param int_factor: Value used to calculate area under Gaussian curve (integer) + #' + #' @return roi_value_list: list of fit values for region of interest (list) + + peak_mean <- NULL + peak_area <- NULL + peak_scale <- NULL + peak_sigma <- NULL + + # set range vectors for 4 peaks + range1 <- c(max_index[1] - 2, max_index[1] - 1, max_index[1], max_index[1] + 1, max_index[1] + 2) + range2 <- c(max_index[2] - 2, max_index[2] - 1, max_index[2], max_index[2] + 1, max_index[2] + 2) + range3 <- c(max_index[3] - 2, max_index[3] - 1, max_index[3], max_index[3] + 1, max_index[3] + 2) + range4 <- c(max_index[4] - 2, max_index[4] - 1, max_index[4], max_index[4] + 1, max_index[4] + 2) + if (range1[1] == 0) range1 <- range1[-1] + if (length(mass_vector) < range4[length(range4)]) range4 <- range4[-length(range4)] + range1 <- check_overlap(range1, range2)[[1]] + range2 <- check_overlap(range1, range2)[[2]] + range2 <- check_overlap(range2, range3)[[1]] + range3 <- check_overlap(range2, range3)[[2]] + range3 <- check_overlap(range3, range4)[[1]] + range4 <- check_overlap(range3, range4)[[2]] + remove <- which(range4 > length(mass_vector)) + if (length(remove) > 0) { + range4 <- range4[-remove] + } + # check for negative or 0 + remove <- which(range1 < 1) + if (length(remove) > 0) range1 <- range1[-remove] + remove <- which(range2 < 1) + if (length(remove) > 0) range2 <- range2[-remove] + remove <- which(range3 < 1) + if (length(remove) > 0) range3 <- range3[-remove] + remove <- which(range4 < 1) + if (length(remove) > 0) range4 <- range4[-remove] + # remove NA + if (length(which(is.na(int_vector[range1]))) != 0) range1 <- range1[-which(is.na(int_vector[range1]))] + if (length(which(is.na(int_vector[range2]))) != 0) range2 <- range2[-which(is.na(int_vector[range2]))] + if (length(which(is.na(int_vector[range3]))) != 0) range3 <- range3[-which(is.na(int_vector[range3]))] + if (length(which(is.na(int_vector[range4]))) != 0) range4 <- range4[-which(is.na(int_vector[range4]))] + + # fit 4 peaks, first separately, then together + mu1 <- weighted.mean(mass_vector[range1], int_vector[range1]) + sigma1 <- get_stdev(mass_vector[range1], int_vector[range1]) + fitted_peak <- fit_1gaussian(mass_vector[range1], int_vector[range1], sigma1, mu1, scale, use_bounds) + p1 <- fitted_peak$par + # second peak + mu2 <- weighted.mean(mass_vector[range2], int_vector[range2]) + sigma2 <- get_stdev(mass_vector[range2], int_vector[range2]) + fitted_peak <- fit_1gaussian(mass_vector[range2], int_vector[range2], sigma2, mu2, scale, use_bounds) + p2 <- fitted_peak$par + # third peak + mu3 <- weighted.mean(mass_vector[range3], int_vector[range3]) + sigma3 <- get_stdev(mass_vector[range3], int_vector[range3]) + fitted_peak <- fit_1gaussian(mass_vector[range3], int_vector[range3], sigma3, mu3, scale, use_bounds) + p3 <- fitted_peak$par + # fourth peak + mu4 <- weighted.mean(mass_vector[range4], int_vector[range4]) + sigma4 <- get_stdev(mass_vector[range4], int_vector[range4]) + fitted_peak <- fit_1gaussian(mass_vector[range4], int_vector[range4], sigma4, mu4, scale, use_bounds) + p4 <- fitted_peak$par + # combined + fitted_4peaks <- fit_4gaussians(mass_vector, int_vector, sigma1, sigma2, sigma3, sigma3, + p1[1], p1[2], p2[1], p2[2], p3[1], p3[2], p4[1], p4[2], use_bounds) + pc <- fitted_4peaks$par + + # get fit quality + sum_fit <- (pc[2] * dnorm(mass_vector, pc[1], sigma1)) + + (pc[4] * dnorm(mass_vector, pc[3], sigma2)) + + (pc[6] * dnorm(mass_vector, pc[5], sigma3)) + + (pc[8] * dnorm(mass_vector, pc[7], sigma3)) + fq <- get_fit_quality(mass_vector, int_vector, + sort(c(pc[1], pc[3], pc[5], pc[7]))[1], sort(c(pc[1], pc[3], pc[5], pc[7]))[4], + resol, sum_fit = sum_fit)$fq_new + + # get parameter values + area1 <- estimate_area(pc[1], resol, pc[2], sigma1, int_factor) + area2 <- estimate_area(pc[3], resol, pc[4], sigma2, int_factor) + area3 <- estimate_area(pc[5], resol, pc[6], sigma3, int_factor) + area4 <- estimate_area(pc[7], resol, pc[8], sigma4, int_factor) + peak_area <- c(peak_area, area1) + peak_area <- c(peak_area, area2) + peak_area <- c(peak_area, area3) + peak_area <- c(peak_area, area4) + peak_mean <- c(peak_mean, pc[1]) + peak_mean <- c(peak_mean, pc[3]) + peak_mean <- c(peak_mean, pc[5]) + peak_mean <- c(peak_mean, pc[7]) + peak_scale <- c(peak_scale, pc[2]) + peak_scale <- c(peak_scale, pc[4]) + peak_scale <- c(peak_scale, pc[6]) + peak_scale <- c(peak_scale, pc[8]) + peak_sigma <- c(peak_sigma, sigma1) + peak_sigma <- c(peak_sigma, sigma2) + peak_sigma <- c(peak_sigma, sigma3) + peak_sigma <- c(peak_sigma, sigma4) + + roi_value_list <- list("mean" = peak_mean, "scale" = peak_scale, "sigma" = peak_sigma, "area" = peak_area, "qual" = fq) + return(roi_value_list) +} + diff --git a/DIMS/Utils/get_element_info.R b/DIMS/Utils/get_element_info.R new file mode 100644 index 0000000..a619f6a --- /dev/null +++ b/DIMS/Utils/get_element_info.R @@ -0,0 +1,24 @@ +## adapted from elementInfo.R, which is adapted from Rdisop function .getElement +# refactor: check where library is initialised: library(Rdisop) +get_element_info <- function(name, elements = NULL) { + #' Get info on m/z and isotopes for all chemical elements + #' + #' @param name: Name of adduct, e.g. Na (string) + #' @param elements: List of all adducts to take into account (list of strings) + #' + #' @return element_info: peak group list with filled-in intensities (matrix) + + # get information on all elements + if (!is.list(elements) || length(elements) == 0 ) { + elements <- initializePSE() + } + # extract information for a particular adduct + if (name == "CH3OH+H") { + # regular_expr should be exact match for name, except for methanol + regular_expr <- "^CH3OH\\+H$" + } else { + regular_expr <- paste0("^", name, "$") + } + element_info <- elements[[grep(regular_expr, sapply(elements, function(x) { x$name }))]] + return(element_info) +} \ No newline at end of file diff --git a/DIMS/Utils/get_fit_quality.R b/DIMS/Utils/get_fit_quality.R new file mode 100644 index 0000000..23b1947 --- /dev/null +++ b/DIMS/Utils/get_fit_quality.R @@ -0,0 +1,34 @@ +## adapted from getFitQuality.R +# parameter not used: mu_last +get_fit_quality <- function(mass_vector, int_vector, mu_first, mu_last, resol, scale = NULL, sigma = NULL, sum_fit = NULL) { + #' Fit 1 Gaussian peak in small region of m/z + #' + #' @param mass_vector: Vector of m/z values for a region of interest (float) + #' @param int_vector: Value used to calculate area under Gaussian curve (integer) + #' @param mu_first: Value for first peak (float) + #' @param scale: Initial value used to estimate scaling parameter (integer) + #' @param resol: Value for resolution (integer) + #' @param sum_fit: Value indicating quality of fit of Gaussian curve (float) + #' + #' @return list_params: list of parameters indicating quality of fit (list) + if (is.null(sum_fit)) { + mass_vector_int <- mass_vector + int_vector_int <- int_vector + # get new fit quality + fq_new <- mean(abs((scale * dnorm(mass_vector_int, mu_first, sigma)) - int_vector_int) / + rep((max(scale * dnorm(mass_vector_int, mu_first, sigma)) / 2), length(mass_vector_int))) + } else { + sum_fit_int <- sum_fit + int_vector_int <- int_vector + mass_vector_int <- mass_vector + # get new fit quality + fq_new <- mean(abs(sum_fit_int - int_vector_int) / rep(max(sum_fit_int) /2, length(sum_fit_int))) + } + + # Prevent division by 0 + if (is.nan(fq_new)) fq_new <- 1 + + list_params <- list("fq_new" = fq_new, "x_int" = mass_vector_int, "y_int" = int_vector_int) + return(list_params) +} + diff --git a/DIMS/Utils/get_fwhm.R b/DIMS/Utils/get_fwhm.R new file mode 100644 index 0000000..7543921 --- /dev/null +++ b/DIMS/Utils/get_fwhm.R @@ -0,0 +1,21 @@ +## adapted from getFwhm.R +get_fwhm <- function(query_mass, resol) { + #' Calculate fwhm (full width at half maximum intensity) for a peak + #' + #' @param query_mass: Value for mass (float) + #' @param resol: Value for resolution (integer) + #' + #' @return fwhm: Value for full width at half maximum (float) + + # set aberrant values of query_mass to zero + if (is.nan(query_mass)) query_mass <- 0 + if (is.na(query_mass)) query_mass <- 0 + if (is.null(query_mass)) query_mass <- 0 + if (query_mass < 0) query_mass <- 0 + # calculate resolution at given m/z value + resol_mz <- resol * (1 / sqrt(2) ^ (log2(query_mass / 200))) + # calculate full width at half maximum + fwhm <- query_mass / resol_mz + return(fwhm) +} + diff --git a/DIMS/Utils/get_patient_data_to_helix.R b/DIMS/Utils/get_patient_data_to_helix.R new file mode 100644 index 0000000..1eeaf7a --- /dev/null +++ b/DIMS/Utils/get_patient_data_to_helix.R @@ -0,0 +1,39 @@ +get_patient_data_to_helix <- function(metab_interest_sorted, metab_list_all) { + #' Get patient data to be uploaded to Helix + #' + #' @param metab_interest_sorted: list of dataframes with metabolite Z-scores for each sample/patient (list) + #' @param metab_list_all: list of tables with metabolites for Helix and violin plots (list) + #' + #' @return: dataframe with patient data with only metabolites for Helix and violin plots + #' with Helix name, high/low Z-score cutoffs + + # Combine Z-scores of metab groups together + df_all_metabs_zscores <- bind_rows(metab_interest_sorted) + # Change columnnames + colnames(df_all_metabs_zscores) <- c("HMDB_name", "Patient", "Z_score") + # Change Patient column to character instead of factor + df_all_metabs_zscores$Patient <- as.character(df_all_metabs_zscores$Patient) + + # Delete whitespaces HMDB_name + df_all_metabs_zscores$HMDB_name <- str_trim(df_all_metabs_zscores$HMDB_name, "right") + + # Split HMDB_name column on "nitine;" for match dims_helix_table + df_all_metabs_zscores$HMDB_name_split <- str_split_fixed(df_all_metabs_zscores$HMDB_name, "nitine;", 2)[, 1] + + # Combine stofgroepen + dims_helix_table <- bind_rows(metab_list_all) + # Filter table for metabolites for Helix + dims_helix_table <- dims_helix_table %>% filter(Helix == "ja") + # Split HMDB_name column on "nitine;" for match df_all_metabs_zscores + dims_helix_table$HMDB_name_split <- str_split_fixed(dims_helix_table$HMDB_name, "nitine;", 2)[, 1] + dims_helix_table <- dims_helix_table %>% select(HMDB_name_split, Helix_naam, high_zscore, low_zscore) + + # Filter DIMS results for metabolites for Helix + df_metabs_helix <- df_all_metabs_zscores %>% filter(HMDB_name_split %in% dims_helix_table$HMDB_name_split) + # Combine dims_helix_table and df_metabs_helix, adding Helix codes etc. + df_metabs_helix <- df_metabs_helix %>% left_join(dims_helix_table, by = join_by(HMDB_name_split)) + + df_metabs_helix <- df_metabs_helix %>% select(HMDB_name, Patient, Z_score, Helix_naam, high_zscore, low_zscore) + + return(df_metabs_helix) +} diff --git a/DIMS/Utils/get_stdev.R b/DIMS/Utils/get_stdev.R new file mode 100644 index 0000000..a385187 --- /dev/null +++ b/DIMS/Utils/get_stdev.R @@ -0,0 +1,22 @@ +## adapted from getSD.R +get_stdev <- function(mass_vector, int_vector, resol = 140000) { + #' Calculate standard deviation to determine width of a peak + #' + #' @param mass_vector: Vector of 3 mass values (float) + #' @param int_vector: Vector of 3 intensities (float) + #' @param resol: Value for resolution (integer) + #' + #' @return stdev: Value for standard deviation + # find maximum intensity in vector + max_index <- which(int_vector == max(int_vector)) + # find corresponding mass at maximum intensity + max_mass <- mass_vector[max_index] + # calculate resolution at given m/z value + resol_mz <- resol * (1 / sqrt(2) ^ (log2(max_mass / 200))) + # calculate full width at half maximum + fwhm <- max_mass / resol_mz + # calculate standard deviation + stdev <- (fwhm / 2) * 0.85 + return(stdev) +} + diff --git a/DIMS/Utils/identify_noisepeaks.R b/DIMS/Utils/identify_noisepeaks.R new file mode 100644 index 0000000..cf79279 --- /dev/null +++ b/DIMS/Utils/identify_noisepeaks.R @@ -0,0 +1,103 @@ +## adapted from ident.hires.noise.HPC +# refactor: remove variables slope, incpt, ppm_iso_fixed +# combine with function get_element_info +# modified identify function to also look for adducts and their isotopes +identify_noisepeaks <- function(peakgroup_list, all_adducts, scanmode = "Negative", look4 = c("Cl", "Ac"), + noise_mz = NULL, resol = 140000, slope = 0, incpt = 0, ppm_fixed, ppm_iso_fixed) { + #' Replace intensities that are zero with random value + #' + #' @param peakgroup_list: Peak group list (matrix) + #' @param all_adducts: List of adducts to take into account (list of strings) + #' @param scanmode: Scan mode, positive or negative (string) + #' @param look4: List of adducts to look for (list of strings) + #' @param noise_mz: All known noise peaks (matrix) + #' @param resol: Value for resolution (integer) + #' @param slope: Value for slope for mass correction (float) + #' @param incpt: Value for intercept for mass correction (float) + #' @param ppm_fixed: Value for distance between two values of mass (integer) + #' @param ppm_iso_fixed: Value for distance between two values of mass for isotope peaks (integer) + #' + #' @return final_outlist: peak group list with filled-in intensities (matrix) + + options(stringsAsFactors = FALSE) + metlin <- assi <- iso <- rep("", nrow(peakgroup_list)) + theormz <- nisos <- expint <- conf <- rep(0, nrow(peakgroup_list)) + + # add adducts to identification list + if (scanmode == "Positive") { + adduct_scanmode <- "+" + } else { + adduct_scanmode <- "-" + } + # make a copy of noise_mz + noise_mz_orig <- noise_mz + + # loop over type of adduct + for (adduct_index in 1:length(look4)) { + noise_mz_adduct <- noise_mz_orig + noise_mz_adduct[, "CompoundName"] <- as.character(noise_mz_orig[, "CompoundName"]) + + if (look4[adduct_index] == "H2O") { + add2label <- paste0("[M-", look4[adduct_index], "]", adduct_scanmode) + } else { + add2label <- paste0("[M+", look4[adduct_index], "]", adduct_scanmode) + } + + noise_mz_adduct[, "CompoundName"] <- paste0(noise_mz_adduct[, "CompoundName"], add2label) + adduct_info <- get_element_info(look4[adduct_index], all_adducts) + if (scanmode == "Positive") { + adduct_mass <- adduct_info$mass[1] + adduct_info$isotope$mass[1] - hydrogen_mass + } else { + adduct_mass <- adduct_info$mass[1] + adduct_info$isotope$mass[1] + hydrogen_mass + } + + # loop over compounds in database + for (compound_index in 1:nrow(noise_mz_adduct)) { + # construct information for compound + adduct: + if (scanmode == "Positive") { + noise_mz_adduct[compound_index, "Mpos"] <- as.numeric(noise_mz_adduct[compound_index, "Mpos"]) + adduct_mass + noise_mz_adduct[compound_index, "MNeg"] <- 0 + } else { + noise_mz_adduct[compound_index, "Mpos"] <- 0 + noise_mz_adduct[compound_index, "MNeg"] <- as.numeric(noise_mz_adduct[compound_index, "MNeg"]) + adduct_mass + } + } + noise_mz <- rbind(noise_mz, noise_mz_adduct) + } + + if (scanmode == "Positive") { + theor_mcol <- as.numeric(noise_mz[, "Mpos"]) + } else { + theor_mcol <- as.numeric(noise_mz[, "MNeg"]) + } + + # get mz information from peakgroup_list + mcol <- peakgroup_list[, "mzmed.pgrp"] + # if column with average intensities is missing, calculate it: + if (!("avg.int" %in% colnames(peakgroup_list))) { + mzmaxcol <- which(colnames(peakgroup_list) == "mzmax.pgrp") + endcol <- ncol(peakgroup_list) + peakgroup_list[, "avg.int"] <- apply(peakgroup_list[, (mzmaxcol + 1):(endcol)], 1, mean) + } + + # do indentification using own database: + for (row_index in 1:nrow(noise_mz)) { + theor_mz <- theor_mcol[row_index] + + # set tolerance for mz accuracy of main peak + mtol <- theor_mz * ppm_fixed / 1000000 + # find main peak + selp <- which(mcol > (theor_mz - mtol) & mcol < (theor_mz + mtol)) + # if there is more than one candidate peak for main, select best one based on mz_diff + if (length(selp) > 1) { + selp <- selp[abs(mcol[selp] - theor_mz) == min(abs(mcol[selp] - theor_mz))] + } + if (length(selp) == 1) { + assi[selp] <- paste(assi[selp], as.character(noise_mz[row_index, "CompoundName"]), sep = ";") + theormz[selp] <- theor_mz + } + } + + final_outlist <- cbind(peakgroup_list, assi, theormz, conf, nisos, iso, expint, metlin) + return(final_outlist) +} diff --git a/DIMS/Utils/is_diagnostic_patient.R b/DIMS/Utils/is_diagnostic_patient.R new file mode 100644 index 0000000..5820838 --- /dev/null +++ b/DIMS/Utils/is_diagnostic_patient.R @@ -0,0 +1,11 @@ +is_diagnostic_patient <- function(patient_column) { + #' Check for Diagnostics patients with correct patient number (e.g. starting with "P2024M") + #' + #' @param patient_column: a column from dataframe with IDs (character vector) + #' + #' @return: a logical vector with TRUE or FALSE for each element (vector) + + diagnostic_patients <- grepl("^P[0-9]{4}M", patient_column) + + return(diagnostic_patients) +} diff --git a/DIMS/Utils/merge_duplicate_rows.R b/DIMS/Utils/merge_duplicate_rows.R new file mode 100644 index 0000000..da3bc52 --- /dev/null +++ b/DIMS/Utils/merge_duplicate_rows.R @@ -0,0 +1,59 @@ +## adapted from mergeDuplicatedRows.R +merge_duplicate_rows <- function(peakgroup_list) { + #' Merge identification info for peak groups with the same mass + #' + #' @param peakgroup_list: Peak group list (matrix) + #' + #' @return peakgroup_list_dedup: de-duplicated peak group list (matrix) + + collapse <- function(column_label, peakgroup_list, index_dup) { + #' Collapse identification info for peak groups with the same mass + #' + #' @param column_label: Name of column in peakgroup_list (string) + #' @param peakgroup_list: Peak group list (matrix) + #' @param index_dup: Index of duplicate peak group (integer) + #' + #' @return collapsed_items: Semicolon-separated list of info (string) + # get the item(s) that need to be collapsed + list_items <- as.vector(peakgroup_list[index_dup, column_label]) + # remove NA + if (length(which(is.na(list_items))) > 0) list_items <- list_items[-which(is.na(list_items))] + collapsed_items <- paste(list_items, collapse = ";") + return(collapsed_items) + } + + options(digits = 16) + collect <- NULL + remove <- NULL + + # check for peak groups with identical mass + index_dup <- which(duplicated(peakgroup_list[, "mzmed.pgrp"])) + + while (length(index_dup) > 0) { + # get the index for the peak group which is double + peaklist_index <- which(peakgroup_list[, "mzmed.pgrp"] == peakgroup_list[index_dup[1], "mzmed.pgrp"]) + single_peakgroup <- peakgroup_list[peaklist_index[1], , drop = FALSE] + + # use function collapse to concatenate info + single_peakgroup[, "assi_HMDB"] <- collapse("assi_HMDB", peakgroup_list, peaklist_index) + single_peakgroup[, "iso_HMDB"] <- collapse("iso_HMDB", peakgroup_list, peaklist_index) + single_peakgroup[, "HMDB_code"] <- collapse("HMDB_code", peakgroup_list, peaklist_index) + single_peakgroup[, "assi_noise"] <- collapse("assi_noise", peakgroup_list, peaklist_index) + if (single_peakgroup[, "assi_noise"] == ";") single_peakgroup[, "assi_noise"] <- NA + single_peakgroup[, "theormz_noise"] <- collapse("theormz_noise", peakgroup_list, peaklist_index) + if (single_peakgroup[,"theormz_noise"] == "0;0") single_peakgroup[, "theormz_noise"] <- NA + + # keep track of deduplicated entries + collect <- rbind(collect, single_peakgroup) + remove <- c(remove, peaklist_index) + + # remove current entry from index + index_dup <- index_dup[-which(peakgroup_list[index_dup, "mzmed.pgrp"] == peakgroup_list[index_dup[1], "mzmed.pgrp"])] + } + + # remove duplicate entries + if (!is.null(remove)) peakgroup_list <- peakgroup_list[-remove, ] + # append deduplicated entries + peakgroup_list_dedup <- rbind(peakgroup_list, collect) + return(peakgroup_list_dedup) +} diff --git a/DIMS/Utils/optimize_gaussfit.R b/DIMS/Utils/optimize_gaussfit.R new file mode 100644 index 0000000..2d7f95d --- /dev/null +++ b/DIMS/Utils/optimize_gaussfit.R @@ -0,0 +1,23 @@ +## adapted from optimizeGauss.R +optimize_gaussfit <- function(mass_vector, int_vector, sigma, mass_max) { + #' Optimize fit of Gaussian curve to small region of m/z + #' + #' @param mass_vector: Vector of m/z values for a region of interest (float) + #' @param int_vector: Vector of intensities for a region of interest (float) + #' @param sigma: Value for standard deviation (float) + #' @param mass_max: Value for mass at center of peak (float) + #' + #' @return opt_fit: list of fit values for region of interest (list) + + # define optimization function for optim based on normal distribution + opt_f <- function(p, mass_vector, int_vector, sigma, mass_max) { + curve <- p * dnorm(mass_vector, mass_max, sigma) + return((max(curve) - max(int_vector))^2) + } + + # get optimal value for fitted Gaussian curve + opt_fit <- optimize(opt_f, c(0, 100000), tol = 0.0001, mass_vector, int_vector, sigma, mass_max) + + return(opt_fit$minimum) +} + diff --git a/DIMS/Utils/output_helix.R b/DIMS/Utils/output_helix.R new file mode 100644 index 0000000..09fa1a9 --- /dev/null +++ b/DIMS/Utils/output_helix.R @@ -0,0 +1,39 @@ +output_for_helix <- function(protocol_name, df_metabs_helix) { + #' Get the output dataframe for Helix + #' + #' @param protocol_name: protocol name (string) + #' @param df_metabs_helix: dataframe with metabolite Z-scores for patients (dataframe) + #' + #' @return: dataframe with patient metabolite Z-scores in correct format for Helix + + # Remove positive controls + df_metabs_helix <- df_metabs_helix %>% filter(is_diagnostic_patient(Patient)) + + # Add 'Vial' column, each patient has unique ID + df_metabs_helix <- df_metabs_helix %>% + group_by(Patient) %>% + mutate(Vial = cur_group_id()) %>% + ungroup() + + # Split patient number into labnummer and Onderzoeksnummer + df_metabs_helix <- add_lab_id_and_onderzoeksnummer(df_metabs_helix) + + # Add column with protocol name + df_metabs_helix$Protocol <- protocol_name + + # Change name Z_score and Helix_naam columns to Amount and Name + change_columns <- c(Amount = "Z_score", Name = "Helix_naam") + df_metabs_helix <- df_metabs_helix %>% rename(all_of(change_columns)) + + # Select only necessary columns and set them in correct order + df_metabs_helix <- df_metabs_helix %>% + select(c(Vial, labnummer, Onderzoeksnummer, Protocol, Name, Amount)) + + # Remove duplicate patient-metabolite combinations ("leucine + isoleucine + allo-isoleucin_Z-score" is added 3 times) + df_metabs_helix <- df_metabs_helix %>% + group_by(Onderzoeksnummer, Name) %>% + distinct() %>% + ungroup() + + return(df_metabs_helix) +} diff --git a/DIMS/Utils/prepare_alarmvalues.R b/DIMS/Utils/prepare_alarmvalues.R new file mode 100644 index 0000000..94ffbfd --- /dev/null +++ b/DIMS/Utils/prepare_alarmvalues.R @@ -0,0 +1,59 @@ +prepare_alarmvalues <- function(pt_name, dims_helix_table) { + #' Create a dataframe with all metabolites that exceed the min and max Z-score cutoffs + #' + #' @param pt_name: patient code (string) + #' @param dims_helix_table: dataframe with metabolite Z-scores for each patient and Helix info (dataframe) + #' + #' @return: dataframe with metabolites that exceed the min and max Z-score cutoffs for the selected patient + + # extract data for patient of interest (pt_name) + pt_metabs_helix <- dims_helix_table %>% filter(Patient == pt_name) + pt_metabs_helix$Z_score <- round(pt_metabs_helix$Z_score, 2) + + # Make empty dataframes for metabolites above or below alarmvalues + pt_list_high <- data.frame(HMDB_name = character(), Z_score = numeric()) + pt_list_low <- data.frame(HMDB_name = character(), Z_score = numeric()) + + # Loop over individual metabolites + for (metab in unique(pt_metabs_helix$HMDB_name)){ + # Get data for individual metabolite + pt_metab <- pt_metabs_helix %>% filter(HMDB_name == metab) + # print(pt_metab) + + # Check if zscore is positive of negative + if (pt_metab$Z_score > 0) { + # Get specific alarmvalue for metabolite + high_zscore_cutoff_metab <- pt_metabs_helix %>% filter(HMDB_name == metab) %>% pull(high_zscore) + + # If zscore is above the alarmvalue, add to pt_list_high table + if (pt_metab$Z_score > high_zscore_cutoff_metab) { + pt_metab_high <- pt_metab %>% select(HMDB_name, Z_score) + pt_list_high <- rbind(pt_list_high, pt_metab_high) + } + } else { + # Get specific alarmvalue for metabolite + low_zscore_cutoff_metab <- pt_metabs_helix %>% filter(HMDB_name == metab) %>% pull(low_zscore) + + # If zscore is below the alarmvalue, add to pt_list_low table + if (pt_metab$Z_score < low_zscore_cutoff_metab) { + pt_metab_low <- pt_metab %>% select(HMDB_name, Z_score) + pt_list_low <- rbind(pt_list_low, pt_metab_low) + } + } + } + + # sort tables on zscore + pt_list_high <- pt_list_high %>% arrange(desc(Z_score)) + pt_list_low <- pt_list_low %>% arrange(Z_score) + # add lines for increased, decreased + extra_line1 <- c("Increased", "") + extra_line2 <- c("Decreased", "") + # combine the two lists + top_metab_pt <- rbind(extra_line1, pt_list_high, extra_line2, pt_list_low) + # remove row names + rownames(top_metab_pt) <- NULL + # change column names for display + colnames(top_metab_pt) <- c("Metabolite", "Z-score") + + return(top_metab_pt) +} diff --git a/DIMS/Utils/prepare_data.R b/DIMS/Utils/prepare_data.R new file mode 100644 index 0000000..ea3efe5 --- /dev/null +++ b/DIMS/Utils/prepare_data.R @@ -0,0 +1,51 @@ +# unused variables will be removed: metab_list_alarm +prepare_data <- function(metab_list_all, zscore_patients_local) { + #' Combine patient Z-scores with metabolite info + #' + #' @param metab_list_all: list of dataframes with metabolite information for different stofgroepen (list) + #' @param zscore_patients_local: dataframe with metabolite Z-scores for all patient + #' + #' @return: list of dataframes for each stofgroep with data for each metabolite and patient/control per row + + # remove "_Zscore" from column (patient) names + colnames(zscore_patients_local) <- gsub("_Zscore", "", colnames(zscore_patients_local)) + # put data into pages, max 20 violin plots per page in PDF + metab_interest_sorted <- list() + metab_category <- c() + for (metab_class_index in 1:length(metab_list_all)) { + metab_class <- names(metab_list_all)[metab_class_index] + metab_list <- metab_list_all[[metab_class_index]] + if (ncol(metab_list) > 2) { + # third column are the alarm values, so reduce the data frame to 2 columns and save list + metab_list_alarm <- metab_list + metab_list <- metab_list[, c(1, 2)] + } + # make sure that all HMDB_names have 45 characters + for (metab_index in 1:length(metab_list$HMDB_name)) { + if (is.character(metab_list$HMDB_name[metab_index])) { + hmdb_name_separated <- strsplit(metab_list$HMDB_name[metab_index], "")[[1]] + } else { + hmdb_name_separated <- "strspliterror" + } + if (length(hmdb_name_separated) <= 45) { + hmdb_name_separated <- c(hmdb_name_separated, rep(" ", 45 - length(hmdb_name_separated))) + } else { + hmdb_name_separated <- c(hmdb_name_separated[1:42], "...") + } + metab_list$HMDB_name[metab_index] <- paste0(hmdb_name_separated, collapse = "") + } + # find metabolites and ratios in data frame zscore_patients_local + metab_interest <- inner_join(metab_list, zscore_patients_local[-2], by = "HMDB_code") + # remove column "HMDB_code" + metab_interest <- metab_interest[, -which(colnames(metab_interest) == "HMDB_code")] + # put the data frame in long format + metab_interest_melt <- reshape2::melt(metab_interest, id.vars = "HMDB_name") + # sort on metabolite names (HMDB_name) + sort_order <- order(metab_interest_melt$HMDB_name) + metab_interest_sorted[[metab_class_index]] <- metab_interest_melt[sort_order, ] + metab_category <- c(metab_category, metab_class) + } + names(metab_interest_sorted) <- metab_category + + return(metab_interest_sorted) +} diff --git a/DIMS/Utils/prepare_data_perpage.R b/DIMS/Utils/prepare_data_perpage.R new file mode 100644 index 0000000..61d2e15 --- /dev/null +++ b/DIMS/Utils/prepare_data_perpage.R @@ -0,0 +1,58 @@ +# remove default variable values +prepare_data_perpage <- function(metab_interest_sorted, metab_interest_contr, nr_plots_perpage, nr_pat = 20, nr_contr = 30) { + #' Combine patient and control data for each page of the violinplot pdf + #' + #' @param metab_interest_sorted: list of dataframes with data for each metabolite and patient (list) + #' @param metab_interest_contr: list of dataframes with data for each metabolite and control (list) + #' @param nr_plots_perpage: number of plots per page in the violinplot pdf (integer) + #' @param nr_pat: number of patients (integer) + #' @param nr_contr: number of controls (integer) + #' + #' @return: list of dataframes with metabolite Z-scores for each patient and control, + #' the length of list is the number of pages for the violinplot pdf (list) + + total_nr_pages <- 0 + metab_perpage <- list() + metab_category <- c() + for (metab_class_index in 1:length(metab_interest_sorted)) { + # split list into pages, each page containing max nr_plots_perpage (20) compounds + metab_interest_perclass <- metab_interest_sorted[[metab_class_index]] + metab_class <- names(metab_interest_sorted)[metab_class_index] + # add controls + metab_interest_contr_perclass <- metab_interest_contr[[metab_class_index]] + # number of pages for this class + nr_pages <- ceiling(length(unique(metab_interest_perclass$HMDB_name)) / nr_plots_perpage) + for (page_nr in 1:nr_pages) { + total_nr_pages <- total_nr_pages + 1 + select_rows_start <- (nr_pat * nr_plots_perpage * (page_nr - 1)) + 1 + select_rows_end <- nr_pat * nr_plots_perpage * page_nr + metab_onepage_pat <- metab_interest_perclass[select_rows_start:select_rows_end, ] + # same for controls + select_rows_start_contr <- (nr_contr * nr_plots_perpage * (page_nr - 1)) + 1 + select_rows_end_contr <- nr_contr * nr_plots_perpage * page_nr + metab_onepage_pcontr <- metab_interest_contr_perclass[select_rows_start_contr:select_rows_end_contr, ] + # add controls + metab_onepage <- rbind(metab_onepage_pat, metab_onepage_pcontr) + # if a page has fewer than nr_plots_perpage plots, fill page with empty plots + na_rows <- which(is.na(metab_onepage$HMDB_name)) + if (length(na_rows) > 0) { + # repeat the patient and control variables + metab_onepage$variable[na_rows] <- metab_onepage$variable[1:(nr_pat + nr_contr)] + # for HMDB name, substitute a number of spaces + for (row_nr in na_rows) { + metab_onepage$HMDB_name[row_nr] <- paste0(rep("_", ceiling(row_nr / (nr_pat + nr_contr))), collapse = "") + } + metab_onepage$HMDB_name <- gsub("_", " ", metab_onepage$HMDB_name) + # leave the values at NA + } + # put data for one page into object with data for all pages + metab_perpage[[total_nr_pages]] <- metab_onepage + # create list of page headers + metab_category <- c(metab_category, paste(metab_class, page_nr, sep = "_")) + } + } + # add page headers to list + names(metab_perpage) <- metab_category + + return(metab_perpage) +} diff --git a/DIMS/Utils/prepare_toplist.R b/DIMS/Utils/prepare_toplist.R new file mode 100644 index 0000000..cc616c4 --- /dev/null +++ b/DIMS/Utils/prepare_toplist.R @@ -0,0 +1,35 @@ +prepare_toplist <- function(pt_name, zscore_patients_copy) { + #' Create a dataframe with the top 20 highest and top 10 lowest metabolites per patient + #' + #' @param pt_name: patient code (string) + #' @param zscore_patients_copy: dataframe with metabolite Z-scores per patient (dataframe) + #' + #' @return: dataframe with 30 metabolites and Z-scores (dataframe) + + # set parameters for table + top_highest <- 20 + top_lowest <- 10 + + # extract data for patient of interest (pt_name) + pt_list <- zscore_patients_copy[, c(1, 2, which(colnames(zscore_patients_copy) == pt_name))] + # sort metabolites on Z-scores for this patient + pt_list_sort <- sort(pt_list[, 3], index.return = TRUE) + # determine top highest and lowest Z-scores for this patient + pt_list_sort <- sort(pt_list[, 3], index.return = TRUE) + pt_list_low <- pt_list[pt_list_sort$ix[1:top_lowest], ] + pt_list_high <- pt_list[pt_list_sort$ix[length(pt_list_sort$ix):(length(pt_list_sort$ix) - top_highest + 1)], ] + # round off Z-scores + pt_list_low[, 3] <- round(as.numeric(pt_list_low[, 3]), 2) + pt_list_high[, 3] <- round(as.numeric(pt_list_high[, 3]), 2) + # add lines for increased, decreased + extra_line1 <- c("Increased", "", "") + extra_line2 <- c("Decreased", "", "") + top_metab_pt <- rbind(extra_line1, pt_list_high, extra_line2, pt_list_low) + # remove row names + rownames(top_metab_pt) <- NULL + + # change column names for display + colnames(top_metab_pt) <- c("HMDB_ID", "Metabolite", "Z-score") + + return(top_metab_pt) +} diff --git a/DIMS/Utils/replace_zeros.R b/DIMS/Utils/replace_zeros.R new file mode 100644 index 0000000..0ef06a3 --- /dev/null +++ b/DIMS/Utils/replace_zeros.R @@ -0,0 +1,64 @@ +## adapted from replaceZeros.R +# this function does two things: replace zeros with random value and identify noise peaks +# refactor: split into two functions +# remove parameters outdir and thresh +# make hard-coded path to file with noise peaks into variable +# remove variables outdir, thresh +replace_zeros <- function(peakgroup_list, repl_pattern, scanmode, resol, outdir, thresh, ppm) { + #' Replace intensities that are zero with random value + #' + #' @param peakgroup_list: Peak group list (matrix) + #' @param repl_pattern: Replication pattern (list of strings) + #' @param scanmode: Scan mode, positive or negative (string) + #' @param resol: Value for resolution (integer) + #' @param outdir: Path for output directory (string) + #' @param thresh: Value for threshold (integer) + #' @param ppm: Value for distance between two values of mass (integer) + #' + #' @return final_outlist: peak group list with filled-in intensities (matrix) + + # replace zeros + if (!is.null(peakgroup_list)) { + for (sample_index in 1:length(names(repl_pattern))) { + sample_peaks <- peakgroup_list[, names(repl_pattern)[sample_index]] + zero_intensity <- which(sample_peaks <= 0) + if (!length(zero_intensity)) { + next + } + for (zero_index in 1:length(zero_intensity)) { + area <- fit_optim(peakgroup_list[zero_intensity[zero_index], "mzmed.pgrp"], thresh, + resol, FALSE, scanmode, int_factor = 1 * 10^5, 1, 1)$area + peakgroup_list[zero_intensity[zero_index], names(repl_pattern)[sample_index]] <- rnorm(n = 1, mean = area, + sd = 0.25 * area) + } + } + + # Add column with average intensity + peakgroup_list <- cbind(peakgroup_list, "avg.int" = apply(peakgroup_list[, 7:(ncol(peakgroup_list) - 4)], 1, mean)) + + if (scanmode == "negative") { + label <- "MNeg" + label2 <- "Negative" + # look for adducts in negative mode + look4_adducts <- c("Cl", "Cl37", "For", "NaCl", "KCl", "H2PO4", "HSO4", "Na-H", "K-H", "H2O", "I") + } else { + label <- "Mpos" + label2 <- "Positive" + # look for adducts in positive mode + look4_adducts <- c("Na", "K", "NaCl", "NH4", "2Na-H", "CH3OH", "KCl", "NaK-H") + } + + # Identify noise peaks + noise_mz <- read.table(file = "/hpc/dbg_mz/tools/db/TheoreticalMZ_NegPos_incNaCl.txt", + sep = "\t", header = TRUE, quote = "") + noise_mz <- noise_mz[(noise_mz[, label] != 0), 1:4] + outlist_withnoise <- identify_noisepeaks(peakgroup_list, all_adducts, scanmode = label2, + noise_mz, look4 = look4_adducts, resol = resol, + slope = 0, incpt = 0, ppm_fixed = ppm, ppm_iso_fixed = ppm) + noise_info <- outlist_withnoise[, c("assi", "theormz")] + colnames(noise_info) <- c("assi_noise", "theormz_noise") + + final_outlist <- cbind(peakgroup_list, noise_info) + return(final_outlist) + } +} diff --git a/DIMS/Utils/search_mzrange.R b/DIMS/Utils/search_mzrange.R new file mode 100644 index 0000000..dcdc42a --- /dev/null +++ b/DIMS/Utils/search_mzrange.R @@ -0,0 +1,180 @@ +## adapted from searchMZRange.R +# variables with fixed values will be removed from function parameters +# int_factor, scale, outdir, plot, thresh, width, height +# allpeaks_values should be generated here, not passed on from do_peakfinding +search_mzrange <- function(ints_fullrange, allpeaks_values, int_factor, scale, resol, + outdir, sample_name, scanmode, plot, width, height, thresh) { + #' Divide the full m/z range into regions of interest with min, max and mean m/z + #' + #' @param ints_fullrange: Named list of intensities (float) + #' @param allpeaks_values: Empty list to store results for all peaks + #' @param int_factor: Value used to calculate area under Gaussian curve (integer) + #' @param scale: Initial value used to estimate scaling parameter (integer) + #' @param resol: Value for resolution (integer) + #' @param outdir: Path for output directory (string) + #' @param sample_name: Sample name (string) + #' @param scanmode: Scan mode, positive or negative (string) + #' @param plot: Parameter indicating whether plots should be made (boolean) + #' @param width: Value for width of plot (integer) + #' @param height: Value for height of plot (integer) + #' @param thresh: Value for noise level threshold (integer) + #' + #' @return allpeaks_values: list of m/z regions of interest + + # find indices where intensity is not equal to zero + nonzero_indices <- as.vector(which(ints_fullrange != 0)) + + # bad infusion. These should have been taken out in AverageTechReplicates + if (length(nonzero_indices) == 0) return(allpeaks_values) + + # initialize + end_index <- NULL + start_index <- nonzero_indices[1] + # maximum length of region of interest + max_roi_length <- 15 + + # find regions of interest + for (mz_index in 1:length(nonzero_indices)) { + # check whether mz_index is smaller than length(nonzero_indices). + # only false if mz_index == length(nonzero_indixes). + # second check is true at the end of a peak. + if (mz_index < length(nonzero_indices) && (nonzero_indices[mz_index + 1] - nonzero_indices[mz_index]) > 1) { + end_index <- nonzero_indices[mz_index] + # get m/z values and intensities for this region of interest + mass_vector <- as.numeric(names(ints_fullrange)[c(start_index:end_index)]) + int_vector <- as.vector(ints_fullrange[c(start_index:end_index)]) + # check whether the vector of intensities is not empty. + if (length(int_vector) != 0) { + # check if intensity is above threshold or the maximum intensity is NaN + if (max(int_vector) < thresh || is.nan(max(int_vector))) { + # go to next region of interest + start_index <- nonzero_indices[mz_index + 1] + next + } + # check if there are more intensities than maximum for region of interest + if (length(int_vector) > max_roi_length) { + # trim lowest intensities to zero + int_vector[which(int_vector < min(int_vector) * 1.1)] <- 0 + # split the range into multiple sub ranges + sub_range <- int_vector + names(sub_range) <- mass_vector + allpeaks_values <- search_mzrange(sub_range, allpeaks_values, int_factor, + scale, resol, outdir, sample_name, scanmode, + plot, width, height, thresh) + # A proper peak needs to have at least 3 intensities above threshold + } else if (length(int_vector) > 3) { + # check if the sum of intensities is above zero. Why is this necessary? + if (sum(int_vector) == 0) next + # get initial fit values + roi_values <- fit_init(mass_vector, int_vector, int_factor, scale, resol, + outdir, sample_name, scanmode, plot, width, height, + mz_index, start_index, end_index) + print(roi_values) + if (roi_values$qual[1] == 1) { + # get optimized fit values + roi_values <- fit_optim(mass_vector, int_vector, resol, plot, + scanmode, int_factor, width, height) + # add region of interest to list of all peaks + allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean) + allpeaks_values$area <- c(allpeaks_values$area, roi_values$area) + allpeaks_values$nr <- c(allpeaks_values$nr, sample_name) + allpeaks_values$min <- c(allpeaks_values$min, roi_values$min) + allpeaks_values$max <- c(allpeaks_values$max, roi_values$max) + allpeaks_values$qual <- c(allpeaks_values$qual, 0) + allpeaks_values$spikes <- allpeaks_values$spikes + 1 + + } else { + for (j in 1:length(roi_values$mean)){ + allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean[j]) + allpeaks_values$area <- c(allpeaks_values$area, roi_values$area[j]) + allpeaks_values$nr <- c(allpeaks_values$nr, sample_name) + allpeaks_values$min <- c(allpeaks_values$min, roi_values$min[1]) + allpeaks_values$max <- c(allpeaks_values$max, roi_values$max[1]) + allpeaks_values$qual <- c(allpeaks_values$qual, roi_values$qual[1]) + } + } + + } else { + + roi_values <- fit_optim(mass_vector, int_vector, resol, + plot, scanmode, int_factor, width, height) + allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean) + allpeaks_values$area <- c(allpeaks_values$area, roi_values$area) + allpeaks_values$nr <- c(allpeaks_values$nr, sample_name) + allpeaks_values$min <- c(allpeaks_values$min, roi_values$min) + allpeaks_values$max <- c(allpeaks_values$max, roi_values$max) + allpeaks_values$qual <- c(allpeaks_values$qual, 0) + allpeaks_values$spikes <- allpeaks_values$spikes + 1 + } + } + start_index <- nonzero_indices[mz_index + 1] + } + } + + # last little range + end_index <- nonzero_indices[length(nonzero_indices)] + mass_vector <- as.numeric(names(ints_fullrange)[c(start_index:end_index)]) + int_vector <- as.vector(ints_fullrange[c(start_index:end_index)]) + + if (length(int_vector) != 0) { + # check if intensity above threshold + if (max(int_vector) < thresh || is.nan(max(int_vector))) { + # do nothing + } else { + # check if there are more intensities than maximum for region of interest + if (length(int_vector) > max_roi_length) { + # trim lowest intensities to zero + int_vector[which(int_vector < min(int_vector) * 1.1)] <- 0 + # split the range into multiple sub ranges + sub_range <- int_vector + names(sub_range) <- mass_vector + + allpeaks_values <- search_mzrange(sub_range, allpeaks_values, int_factor, scale, resol, + outdir, sample_name, scanmode, + plot, width, height, thresh) + + } else if (length(int_vector) > 3) { + # Check only zeros + if (sum(int_vector) == 0) next + + roi_values <- fit_init(mass_vector, int_vector, int_factor, scale, resol, + outdir, sample_name, scanmode, plot, width, height, + mz_index, start_index, end_index) + if (roi_values$qual[1] == 1) { + roi_values <- fit_optim(mass_vector, int_vector, resol, + plot, scanmode, int_factor, width, height) + + allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean) + allpeaks_values$area <- c(allpeaks_values$area, roi_values$area) + allpeaks_values$nr <- c(allpeaks_values$nr, sample_name) + allpeaks_values$min <- c(allpeaks_values$min, roi_values$min) + allpeaks_values$max <- c(allpeaks_values$max, roi_values$max) + allpeaks_values$qual <- c(allpeaks_values$qual, 0) + allpeaks_values$spikes <- allpeaks_values$spikes + 1 + + } else { + for (j in 1:length(roi_values$mean)){ + allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean[j]) + allpeaks_values$area <- c(allpeaks_values$area, roi_values$area[j]) + allpeaks_values$nr <- c(allpeaks_values$nr, sample_name) + allpeaks_values$min <- c(allpeaks_values$min, roi_values$min[1]) + allpeaks_values$max <- c(allpeaks_values$max, roi_values$max[1]) + allpeaks_values$qual <- c(allpeaks_values$qual, roi_values$qual[1]) + } + } + } else { + roi_values <- fit_optim(mass_vector, int_vector, resol, + plot, scanmode, int_factor, width, height) + allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean) + allpeaks_values$area <- c(allpeaks_values$area, roi_values$area) + allpeaks_values$nr <- c(allpeaks_values$nr, sample_name) + allpeaks_values$min <- c(allpeaks_values$min, roi_values$min) + allpeaks_values$max <- c(allpeaks_values$max, roi_values$max) + allpeaks_values$qual <- c(allpeaks_values$qual, 0) + allpeaks_values$spikes <- allpeaks_values$spikes + 1 + } + } + } + return(allpeaks_values) +} + diff --git a/DIMS/Utils/sum_curves.R b/DIMS/Utils/sum_curves.R new file mode 100644 index 0000000..542026b --- /dev/null +++ b/DIMS/Utils/sum_curves.R @@ -0,0 +1,35 @@ +## adapted from sumCurves.R +# variables with fixed values will be removed from function parameters +# plot +# parameter half_max not used +sum_curves <- function(mean1, mean2, scale1, scale2, sigma1, sigma2, mass_vector2, mass_vector, resol, plot) { + #' Sum two curves + #' + #' @param mean1: Value for mean m/z of first peak (float) + #' @param mean2: Value for mean m/z of second peak (float) + #' @param scale1: Initial value used to estimate scaling parameter for first peak (integer) + #' @param scale2: Initial value used to estimate scaling parameter for second peak (integer) + #' @param sigma1: Value for standard deviation for first peak (float) + #' @param sigma2: Value for standard deviation for second peak (float) + #' @param mass_vector2: Vector of equally spaced m/z values (float) + #' @param mass_vector: Vector of m/z values for a region of interest (float) + #' @param resol: Value for resolution (integer) + #' @param plot: Parameter indicating whether plots should be made (boolean) + #' + #' @return list_params: list of parameters indicating quality of fit (list) + + sum_fit <- (scale1 * dnorm(mass_vector2, mean1, sigma1)) + (scale2 * dnorm(mass_vector2, mean2, sigma2)) + + mean1_plus2 <- weighted.mean(c(mean1, mean2), c(max(scale1 * dnorm(mass_vector2, mean1, sigma1)), + max(scale2 * dnorm(mass_vector2, mean2, sigma2)))) + + # get new values for parameters + fwhm <- get_fwhm(mean1_plus2, resol) + area <- max(sum_fit) + scale <- scale1 + scale2 + sigma <- (fwhm / 2) * 0.85 + + list_params <- list("mean" = mean1_plus2, "area" = area, "scale" = scale, "sigma" = sigma) + return(list_params) +} + diff --git a/DIMS/Utils/within_ppm.R b/DIMS/Utils/within_ppm.R new file mode 100644 index 0000000..abdb0d4 --- /dev/null +++ b/DIMS/Utils/within_ppm.R @@ -0,0 +1,64 @@ +## adapted from isWithinXppm.R +# variables with fixed values will be removed from function parameters +# plot +within_ppm <- function(mean, scale, sigma, area, mass_vector2, mass_vector, ppm = 4, resol, plot) { + #' Test whether two mass ranges are within ppm distance of each other + #' + #' @param mean: Value for mean m/z (float) + #' @param scale: Initial value used to estimate scaling parameter (integer) + #' @param sigma: Value for standard deviation (float) + #' @param area: Value for area under the curve (float) + #' @param mass_vector2: Vector of equally spaced m/z values (float) + #' @param mass_vector: Vector of m/z values for a region of interest (float) + #' @param ppm: Value for distance between two values of mass (integer) + #' @param resol: Value for resolution (integer) + #' @param plot: Parameter indicating whether plots should be made (boolean) + #' + #' @return list_params: list of parameters indicating quality of fit (list) + + # sort + index <- order(mean) + mean <- mean[index] + scale <- scale[index] + sigma <- sigma[index] + area <- area[index] + + summed <- NULL + remove <- NULL + + if (length(mean) > 1) { + for (i in 2:length(mean)) { + if ((abs(mean[i - 1] - mean[i]) / mean[i - 1]) * 10^6 < ppm) { + + # avoid double occurance in sum + if ((i - 1) %in% summed) next + + result_values <- sum_curves(mean[i - 1], mean[i], scale[i - 1], scale[i], sigma[i - 1], sigma[i], + mass_vector2, mass_vector, resol, plot) + summed <- c(summed, i - 1, i) + if (is.nan(result_values$mean)) result_values$mean <- 0 + mean[i - 1] <- result_values$mean + mean[i] <- result_values$mean + area[i - 1] <- result_values$area + area[i] <- result_values$area + scale[i - 1] <- result_values$scale + scale[i] <- result_values$scale + sigma[i - 1] <- result_values$sigma + sigma[i] <- result_values$sigma + + remove <- c(remove, i) + } + } + } + + if (length(remove) != 0) { + mean <- mean[-c(remove)] + area <- area[-c(remove)] + scale <- scale[-c(remove)] + sigma <- sigma[-c(remove)] + } + + list_params <- list("mean" = mean, "area" = area, "scale" = scale, "sigma" = sigma, "qual" = NULL) + return(list_params) +} +