Merge pull request #36 from UMCUGenetics/release/v2.2.0

Release/v2.2.0 (DIMS)
UMCUGenetics · Jun 18, 2024 · 972a97e · 972a97e
2 parents 2fa7abc + 9e114c4
commit 972a97e
Show file tree

Hide file tree

Showing 125 changed files with 8,333 additions and 0 deletions.
diff --git a/DIMS/AddOnFunctions/10-collectSamplesFilled_extraoutput.R b/DIMS/AddOnFunctions/10-collectSamplesFilled_extraoutput.R
@@ -0,0 +1,94 @@
+#!/usr/bin/Rscript
+
+.libPaths(new = "/hpc/local/CentOS7/dbg_mz/R_libs/3.2.2")
+
+# load required packages 
+# none 
+
+# define parameters 
+cmd_args <- commandArgs(trailingOnly = TRUE)
+for (arg in cmd_args) cat("  ", arg, "\n")
+
+outdir <- cmd_args[1]
+scanmode <- cmd_args[2]
+normalization <- cmd_args[3]
+scripts <- cmd_args[4]
+z_score <- as.numeric(cmd_args[5])
+ppm <- as.numeric(cmd_args[6])
+
+#outdir <- "/Users/nunen/Documents/Metab/processed/test_old"
+#scanmode <- "negative"
+#normalization <- "disabled"
+#scripts <- "/Users/nunen/Documents/Metab/DIMS/scripts"
+#db <- "/Users/nunen/Documents/Metab/DIMS/db/HMDB_add_iso_corrNaCl_withIS_withC5OH.RData"
+#z_score <- 0
+
+object.files = list.files(paste(outdir, "9-samplePeaksFilled", sep="/"), full.names=TRUE, pattern=scanmode)
+outlist.tot=NULL
+for (i in 1:length(object.files)) {
+  load(object.files[i])
+  print(print(object.files[i]))
+  outlist.tot = rbind(outlist.tot, final.outlist.idpat3)
+}
+
+source(paste(scripts, "AddOnFunctions/sourceDir.R", sep="/"))
+sourceDir(paste(scripts, "AddOnFunctions", sep="/"))
+
+# remove duplicates
+outlist.tot = mergeDuplicatedRows(outlist.tot)
+
+# sort on mass
+outlist.tot = outlist.tot[order(outlist.tot[,"mzmed.pgrp"]),]
+
+# normalization
+load(paste0(outdir, "/repl.pattern.",scanmode,".RData"))
+
+if (normalization != "disabled") {
+  outlist.tot = normalization_2.1(outlist.tot, fileName, names(repl.pattern.filtered), on=normalization, assi_label="assi_HMDB")
+}
+
+if (z_score == 1) {
+  outlist.stats = statistics_z(outlist.tot, sortCol=NULL, adducts=FALSE)
+  nr.removed.samples=length(which(repl.pattern.filtered[]=="character(0)"))
+  order.index.int=order(colnames(outlist.stats)[8:(length(repl.pattern.filtered)-nr.removed.samples+7)])
+  outlist.stats.more = cbind(outlist.stats[,1:7],
+                             outlist.stats[,(length(repl.pattern.filtered)-nr.removed.samples+8):(length(repl.pattern.filtered)-nr.removed.samples+8+6)],
+                             outlist.stats[,8:(length(repl.pattern.filtered)-nr.removed.samples+7)][order.index.int],
+                             outlist.stats[,(length(repl.pattern.filtered)-nr.removed.samples+5+10):ncol(outlist.stats)])
+
+  tmp.index=grep("_Zscore", colnames(outlist.stats.more), fixed = TRUE)
+  tmp.index.order=order(colnames(outlist.stats.more[,tmp.index]))
+  tmp = outlist.stats.more[,tmp.index[tmp.index.order]]
+  outlist.stats.more=outlist.stats.more[,-tmp.index]
+  outlist.stats.more=cbind(outlist.stats.more,tmp)
+  outlist.tot = outlist.stats.more
+}
+
+# filter identified compounds
+index.1=which((outlist.tot[,"assi_HMDB"]!="") & (!is.na(outlist.tot[,"assi_HMDB"])))
+index.2=which((outlist.tot[,"iso_HMDB"]!="") & (!is.na(outlist.tot[,"iso_HMDB"])))
+index=union(index.1,index.2)
+outlist.ident = outlist.tot[index,]
+outlist.not.ident = outlist.tot[-index,]
+
+if (z_score == 1) {
+  outlist.ident$ppmdev=as.numeric(outlist.ident$ppmdev)
+  outlist.ident <- outlist.ident[which(outlist.ident["ppmdev"] >= -ppm & outlist.ident["ppmdev"] <= ppm),]
+}
+# NAs in theormz_noise <======================================================================= uitzoeken!!!
+outlist.ident$theormz_noise[which(is.na(outlist.ident$theormz_noise))] = 0
+outlist.ident$theormz_noise=as.numeric(outlist.ident$theormz_noise)
+outlist.ident$theormz_noise[which(is.na(outlist.ident$theormz_noise))] = 0
+outlist.ident$theormz_noise=as.numeric(outlist.ident$theormz_noise)
+
+save(outlist.not.ident, outlist.ident, file=paste(outdir, "/outlist_identified_", scanmode, ".RData", sep=""))
+
+# Extra output in Excel-readable format:
+remove_columns <- c("fq.best", "fq.worst", "mzmin.pgrp", "mzmax.pgrp")
+remove_colindex <- which(colnames(outlist.ident) %in% remove_columns)
+outlist.ident <- outlist.ident[ , -remove_colindex]
+write.table(outlist.ident, file=paste0(outdir, "/outlist_identified_", scanmode, ".txt"), sep="\t", row.names = FALSE)
+remove_colindex <- which(colnames(outlist.not.ident) %in% remove_columns)
+outlist.not.ident <- outlist.not.ident[ , -remove_colindex]
+write.table(outlist.not.ident, file=paste0(outdir, "/outlist_not_identified_", scanmode, ".txt"), sep="\t", row.names = FALSE)
+
diff --git a/DIMS/AddOnFunctions/add_lab_id_and_onderzoeksnummer.R b/DIMS/AddOnFunctions/add_lab_id_and_onderzoeksnummer.R
@@ -0,0 +1,10 @@
+add_lab_id_and_onderzoeksnummer <- function(df_metabs_helix) {
+  # Split patient number into labnummer and Onderzoeksnummer
+  for (row in 1:nrow(df_metabs_helix)) {
+    df_metabs_helix[row,"labnummer"] <- gsub("^P|\\.[0-9]*", "", df_metabs_helix[row,"Patient"])
+    labnummer_split <- strsplit(as.character(df_metabs_helix[row, "labnummer"]), "M")[[1]]
+    df_metabs_helix[row, "Onderzoeksnummer"] <- paste0("MB", labnummer_split[1], "/", labnummer_split[2])
+  }
+
+  return(df_metabs_helix)
+}
diff --git a/DIMS/AddOnFunctions/checkOverlap.R b/DIMS/AddOnFunctions/checkOverlap.R
@@ -0,0 +1,16 @@
+checkOverlap <- function(range1,range2){
+  if (length(intersect(range1,range2))==2) {
+    # Overlap
+    # message("Overlap, smaller range is used")
+    if (length(range1) >= length(range2)){
+      range1=range1[-length(range1)]  
+    } else {
+      range2=range2[-1]
+    }
+  } else if (length(intersect(range1,range2))==3){
+    # message("Overlap, smaller range is used")
+    range1=range1[-length(range1)]  
+    range2=range2[-1]
+  }
+  return(list("range1"=range1,"range2"=range2))
+}
diff --git a/DIMS/AddOnFunctions/check_same_samplename.R b/DIMS/AddOnFunctions/check_same_samplename.R
@@ -0,0 +1,4 @@
+# function to test whether intensity and Z-score columns match
+check_same_samplename <- function(int_col_name, zscore_col_name) {
+  paste0(int_col_name, "_Zscore") == zscore_col_name
+}
diff --git a/DIMS/AddOnFunctions/create_violin_plots.R b/DIMS/AddOnFunctions/create_violin_plots.R
@@ -0,0 +1,116 @@
+create_violin_plots <- function(pdf_dir, pt_name, metab_perpage, top_metab_pt=NULL) {
+
+  # set parameters for plots
+  plot_height <- 9.6 
+  plot_width <- 6
+  fontsize <- 1 
+  nr_plots_perpage <- 20
+  circlesize <- 0.8 
+  colors_4plot <- c("#22E4AC", "#00B0F0", "#504FFF","#A704FD","#F36265","#DA0641")
+  #                   green     blue      blue/purple purple    orange    red
+
+  # patient plots, create the PDF device
+  pt_name_sub <- pt_name
+  suffix <- ""
+  if (grepl("Diagnostics", pdf_dir) & is_diagnostic_patient(pt_name)) {
+    prefix <- "MB"
+    suffix <- "_DIMS_PL_DIAG"
+    # substitute P and M in P2020M00001 into right format for Helix
+    pt_name_sub <- gsub("[PM]", "", pt_name)
+    pt_name_sub <- gsub("\\..*", "", pt_name_sub)
+  } else if (grepl("Diagnostics", pdf_dir)) {
+    prefix <- "Dx_"
+  } else if (grepl("IEM", pdf_dir)) {
+    prefix <- "IEM_"
+  } else {
+    prefix <- "R_"
+  }
+
+  pdf(paste0(pdf_dir, "/", prefix, pt_name_sub, suffix, ".pdf"),
+      onefile = TRUE,
+      width = plot_width,
+      height = plot_height)
+
+  # page headers:
+  page_headers <- names(metab_perpage)
+
+  # put table into PDF file, if not empty
+  if (!is.null(dim(top_metab_pt))) {
+    plot.new()
+    # get the names and numbers in the table aligned
+    table_theme <- ttheme_default(core = list(fg_params = list(hjust=0, x=0.05, fontsize=6)),
+                                  colhead = list(fg_params = list(fontsize=8, fontface="bold")))
+    grid.table(top_metab_pt, theme = table_theme, rows = NULL)
+    # g <- tableGrob(top_metab_pt)
+    # grid.draw(g)
+    text(x=0.45, y=1.02, paste0("Top deviating metabolites for patient: ", pt_name), font=1, cex=1)
+  }
+
+  # violin plots
+  for (page_index in 1:length(metab_perpage)) {
+    # extract list of metabolites to plot on a page
+    metab_list_2plot <- metab_perpage[[page_index]]
+    # extract original data for patient of interest (pt_name) before cut-offs
+    pt_list_2plot_orig <- metab_list_2plot[which(metab_list_2plot$variable == pt_name), ]
+    # cut off Z-scores higher than 20 or lower than -5 (for nicer plots)
+    metab_list_2plot$value[metab_list_2plot$value >  20] <-  20
+    metab_list_2plot$value[metab_list_2plot$value <  -5] <-  -5
+    # extract data for patient of interest (pt_name)
+    pt_list_2plot <- metab_list_2plot[which(metab_list_2plot$variable == pt_name), ]
+    # restore original Z-score before cut-off, for showing Z-scores in PDF
+    pt_list_2plot$value_orig <- pt_list_2plot_orig$value
+    # remove patient of interest (pt_name) from list; violins will be made up of controls and other patients
+    metab_list_2plot <- metab_list_2plot[-which(metab_list_2plot$variable == pt_name), ]
+    # subtitle per page
+    sub_perpage <- gsub("_", " ", page_headers[page_index])
+    # for IEM plots, put subtitle on two lines
+    sub_perpage <- gsub("probability", "\nprobability", sub_perpage)
+    # add size parameter for showing Z-score of patient per metabolite
+    Z_size <- rep(3, nrow(pt_list_2plot))
+    # set size to 0 if row is empty
+    Z_size[is.na(pt_list_2plot$value)] <- 0
+
+    # draw violin plot. shape=22 gives square for patient of interest
+    ggplot_object <- ggplot(metab_list_2plot, aes(x=value, y=HMDB_name)) +
+      theme(axis.text.y=element_text(size=rel(fontsize)), plot.caption = element_text(size=rel(fontsize))) +
+      xlim(-5, 20) +
+      geom_violin(scale="width") +
+      geom_point(data = pt_list_2plot, aes(color=value), size = 3.5*circlesize, shape=22, fill="white") +
+      scale_fill_gradientn(colors = colors_4plot, values = NULL, space = "Lab", na.value = "grey50", guide = "colourbar", aesthetics = "colour") +
+      # add Z-score value for patient of interest at x=16
+      geom_text(data = pt_list_2plot, aes(16, label = paste0("Z=", round(value_orig, 2))), hjust = "left", vjust = +0.2, size = Z_size) +
+      # add labels. Use font Courier to get all the plots in the same location.
+      labs(x = "Z-scores", y = "Metabolites", subtitle = sub_perpage, color = "z-score") + 
+      theme(axis.text.y = element_text(family = "Courier", size=6)) +
+      # do not show legend
+      theme(legend.position="none") +
+      # add title 
+      ggtitle(label = paste0("Results for patient ", pt_name)) + 
+      # labs(x = "Z-scores", y = "Metabolites", title = paste0("Results for patient ", pt_name), subtitle = sub_perpage, color = "z-score") + 
+      # add vertical lines
+      geom_vline(xintercept = 2, col = "grey", lwd = 0.5, lty=2) +
+      geom_vline(xintercept = -2, col = "grey", lwd = 0.5, lty=2)
+
+    suppressWarnings(print(ggplot_object))
+
+  }
+
+  # add explanation of violin plots, version number etc.
+  # plot.new()
+  plot(NA, xlim=c(0,5), ylim=c(0,5), bty='n', xaxt='n', yaxt='n', xlab='', ylab='')
+  if (length(explanation) > 0) {
+    text(0.2, 5, explanation[1], pos=4, cex=0.8)
+    for (line_index in 2:length(explanation)) {
+      text_y_position <- 5 - (line_index*0.2)
+      text(-0.2, text_y_position, explanation[line_index], pos=4, cex=0.5)
+    }
+    # full_explanation <- paste(explanation[2:length(explanation)], sep="  \n")
+    # text(0.2, 4, full_explanation, pos=4, cex=0.6)
+    #explanation_grob=textGrob(apply(full_explanation, 2, paste, collapse="\n"))
+    #grid.arrange(explanation_grob)
+  }
+
+  # close the PDF file
+  dev.off()
+
+}
diff --git a/DIMS/AddOnFunctions/elementInfo.R b/DIMS/AddOnFunctions/elementInfo.R
@@ -0,0 +1,6 @@
+elementInfo <- function(name, elements = NULL) { # from Rdisop function .getElement  
+  if (!is.list(elements) || length(elements)==0 ) {
+    elements <- initializePSE() }
+  if (name=="CH3OH+H"){rex<-"^CH3OH\\+H$"}else{rex <- paste ("^",name,"$", sep="")}
+  elements [[grep (rex, sapply (elements, function(x) {x$name}))]]
+}
diff --git a/DIMS/AddOnFunctions/export.R b/DIMS/AddOnFunctions/export.R
@@ -0,0 +1,13 @@
+export <- function(peaklist, plotdir, adducts, control_label, case_label, patients, sub, fileName){
+  # peaklist = outlist.adducts
+  # adducts=TRUE
+  # control_label="C"
+  # case_label="P"
+  # patients = getPatients(outlist.adducts)
+  # sub=3000
+
+  # peaklist = statistics_z_4export(as.data.frame(peaklist), plotdir, patients, adducts, control_label, case_label)
+
+  # generateExcelFile(peaklist, file.path(plotdir), imageNum=2, fileName, subName=c("","_box"), sub, adducts)
+
+}
diff --git a/DIMS/AddOnFunctions/findPeaks.Gauss.HPC.R b/DIMS/AddOnFunctions/findPeaks.Gauss.HPC.R
@@ -0,0 +1,24 @@
+### fit Gaussian estimate mean and integrate to obtain intensity
+findPeaks.Gauss.HPC <- function(plist, breaks.fwhm, int.factor, scale, resol, outdir, scanmode, plot, thresh, width, height) {
+  sampname <- colnames(plist)[1]
+
+  range <- as.vector(plist)
+  names(range) <- rownames(plist)
+
+  values <- list("mean"=NULL, "area"=NULL, "nr"=NULL, "min"=NULL, "max"=NULL, "qual"=NULL, "spikes"=0)
+
+  values <- searchMZRange(range, values, int.factor, scale, resol, outdir, sampname, scanmode, plot, width, height, thresh)
+
+  outlist.persample <- NULL
+  outlist.persample <- cbind("samplenr"=values$nr, "mzmed.pkt"=values$mean, "fq"=values$qual, "mzmin.pkt"=values$min, "mzmax.pkt"=values$max, "height.pkt"=values$area)
+
+  index <- which(outlist.persample[ ,"height.pkt"]==0)
+  if (length(index) > 0) {
+    outlist.persample <- outlist.persample[-index,]
+  }
+
+  # save(outlist.persample, file=paste(outdir, paste(sampname, "_", scanmode, ".RData", sep=""), sep="/"))
+  save(outlist.persample, file=paste("./", sampname, "_", scanmode, ".RData", sep=""))
+
+  cat(paste("There were", values$spikes, "spikes!"))
+}