Merge branch 'master' into alignment

RECETOX · Jul 23, 2024 · 47000dd · 47000dd
2 parents ee1d2dc + 4cadd1d
commit 47000dd
Show file tree

Hide file tree

Showing 9 changed files with 152 additions and 139 deletions.
diff --git a/R/adjust.time.R b/R/adjust.time.R
@@ -6,19 +6,16 @@ NULL
 compute_comb <- function(template_features, features) {
   combined <- dplyr::bind_rows(
     template_features,
-    dplyr::bind_cols(features |> dplyr::select(c(mz, rt)), sample_id = features$sample_id)
-  )
-  combined <- combined |> dplyr::arrange_at("mz")
+    features |> dplyr::select(c(mz, rt, cluster, sample_id))
+  ) |> dplyr::arrange_at(c("cluster","mz"))
   return(combined)
 }
 
 #' @export
-compute_sel <- function(combined, mz_tol_relative, rt_tol_relative) {
+compute_sel <- function(combined) {
   l <- nrow(combined)
-  sel <- which(combined$mz[2:l] - combined$mz[1:(l - 1)] <
-    mz_tol_relative * combined$mz[1:(l - 1)] * 2 &
-    abs(combined$rt[2:l] - combined$rt[1:(l - 1)]) <
-      rt_tol_relative & combined$sample_id[2:l] != combined$sample_id[1:(l - 1)])
+  sel <- which(combined$cluster[1:(l - 1)] == combined$cluster[2:l] & 
+               combined$sample_id[1:(l - 1)] != combined$sample_id[2:l])
   return(sel)
 }
 
@@ -32,33 +29,65 @@ compute_template_adjusted_rt <- function(combined, sel, j) {
 
   # now the first column is the template retention time.
   # the second column is the to-be-adjusted retention time
-  
+
   all_features <- all_features[order(all_features[, 2]), ]
   return(all_features)
 }
 
+compute_corrected_features_v2 <- function(features, template_rt, delta_rt) {
+  features <- features |> dplyr::arrange_at(c("rt", "mz"))
+  idx <- dplyr::between(features$rt, min(template_rt), max(template_rt))
+  to_correct <- (features |> dplyr::filter(idx))$rt
+
+  this.smooth <- ksmooth(
+    template_rt,
+    delta_rt,
+    kernel = "normal",
+    bandwidth = (max(delta_rt) - min(delta_rt)) / 5,
+    x.points = to_correct
+  )
+
+  lower_bound_adjustment <- mean(this.smooth$y[this.smooth$x == min(this.smooth$x)])
+  upper_bound_adjustment <- mean(this.smooth$y[this.smooth$x == max(this.smooth$x)])
+
+  features <- features |>
+    dplyr::mutate(rt = dplyr::case_when(
+      rt < min(template_rt) ~ rt + lower_bound_adjustment,
+      rt > max(template_rt) ~ rt + upper_bound_adjustment
+    ))
+  features[idx, "rt"] <- to_correct + this.smooth$y
+  return(features |> dplyr::arrange_at(c("mz", "rt")))
+}
+
 #' @export
 compute_corrected_features <- function(features, delta_rt, avg_time) {
-  features <- features[order(features$rt, features$mz), ]
+  features <- features |> dplyr::arrange_at(c("rt", "mz"))
+
   corrected <- features$rt
   original <- features$rt
-  to_correct <- original[original >= min(delta_rt) &
-    original <= max(delta_rt)]
 
-  this.smooth <- ksmooth(delta_rt, avg_time,
+  idx <- dplyr::between(original, min(delta_rt), max(delta_rt))
+  to_correct <- original[idx]
+  this.smooth <- ksmooth(
+    delta_rt,
+    avg_time,
     kernel = "normal",
     bandwidth = (max(delta_rt) - min(delta_rt)) / 5,
     x.points = to_correct
   )
 
-  corrected[dplyr::between(original, min(delta_rt), max(delta_rt))] <-
-    this.smooth$y + to_correct
-  corrected[original < min(delta_rt)] <- corrected[original < min(delta_rt)] +
-    mean(this.smooth$y[this.smooth$x == min(this.smooth$x)])
-  corrected[original > max(delta_rt)] <- corrected[original > max(delta_rt)] +
-    mean(this.smooth$y[this.smooth$x == max(this.smooth$x)])
+  corrected[idx] <- this.smooth$y + to_correct
+  lower_bound_adjustment <- mean(this.smooth$y[this.smooth$x == min(this.smooth$x)])
+  upper_bound_adjustment <- mean(this.smooth$y[this.smooth$x == max(this.smooth$x)])
+
+  idx_lower <- original < min(delta_rt)
+  idx_upper <- original > max(delta_rt)
+
+  corrected[idx_lower] <- corrected[idx_lower] + lower_bound_adjustment
+  corrected[idx_upper] <- corrected[idx_upper] + upper_bound_adjustment
   features$rt <- corrected
   features <- features[order(features$mz, features$rt), ]
+
   return(features)
 }
 
@@ -76,36 +105,25 @@ fill_missing_values <- function(orig.feature, this.feature) {
 }
 
 #' @export
-compute_template <- function(extracted_features) {
-  num.ftrs <- sapply(extracted_features, nrow)
-  template_id <- which.max(num.ftrs)
-  template <- extracted_features[[template_id]]$sample_id[1]
-  message(paste("the template is sample", template))
-
-  candi <- tibble::as_tibble(extracted_features[[template_id]]) |> dplyr::select(c(mz, rt))
-  template_features <- dplyr::bind_cols(candi, sample_id = rep(template, nrow(candi)))
-  return(tibble::as_tibble(template_features))
-}
-
-#' @export
-correct_time <- function(this.feature, template_features, mz_tol_relative, rt_tol_relative) {
+correct_time <- function(this.feature, template_features) {
     orig.features <- this.feature
     template <- unique(template_features$sample_id)[1]
     j <- unique(this.feature$sample_id)[1]
 
     if (j != template) {
       this.comb <- compute_comb(template_features, this.feature)
-      sel <- compute_sel(this.comb, mz_tol_relative, rt_tol_relative)
+      sel <- compute_sel(this.comb)
 
       if (length(sel) < 20) {
         stop("too few, aborted")
       } else {
         all.ftr.table <- compute_template_adjusted_rt(this.comb, sel, j)
-        # the to be adjusted time
-        this.diff <- all.ftr.table[, 2]
-        # the difference between the true time and the to-be-adjusted time
-        avg_time <- all.ftr.table[, 1] - this.diff
-        this.feature <- compute_corrected_features(this.feature, this.diff, avg_time)
+
+        this.feature <- compute_corrected_features(
+          this.feature,
+          all.ftr.table[, 2],  # the to be adjusted time
+          all.ftr.table[, 1] - all.ftr.table[, 2]  # the difference between the true time and the to-be-adjusted time
+        )
       }
     }
 
@@ -119,6 +137,48 @@ correct_time <- function(this.feature, template_features, mz_tol_relative, rt_to
   return(tibble::as_tibble(this.feature, column_name = c("mz", "rt", "sd1", "sd2", "area", "sample_id", "cluster")))
 }
 
+#' @export
+compute_template <- function(extracted_features) {
+  num.ftrs <- sapply(extracted_features, nrow)
+  template_id <- which.max(num.ftrs)
+  template <- extracted_features[[template_id]]$sample_id[1]
+  message(paste("the template is sample", template))
+
+  candi <- tibble::as_tibble(extracted_features[[template_id]]) |> dplyr::select(c(mz, rt, cluster))
+  template_features <- dplyr::bind_cols(candi, sample_id = rep(template, nrow(candi)))
+  return(tibble::as_tibble(template_features))
+}
+
+correct_time_v2 <- function(features, template) {
+  if (unique(features$sample_id) == unique(template$sample_id))
+    return(tibble::as_tibble(features))
+
+  subsets <- template |>
+    dplyr::bind_rows(
+      features |> dplyr::select(c(mz, rt, cluster, sample_id))
+    ) |>
+    dplyr::arrange_at(c("cluster", "mz")) |>
+    dplyr::group_by(cluster) |>
+    dplyr::mutate(count = dplyr::n_distinct(sample_id)) |>
+    filter(count == 2) |>
+    dplyr::add_count() |>
+    filter(n == 2) |>
+    dplyr::ungroup() |>
+    dplyr::group_by(sample_id) |>
+    dplyr::group_split()
+
+  all_features_new <- cbind(subsets[[1]]$rt, subsets[[2]]$rt)
+  all_features_new_order <- order(all_features_new[, 2])
+  all_features_new_arranged <- all_features_new[all_features_new_order,]
+
+  corrected <- compute_corrected_features_v2(
+    features,
+    all_features_new_arranged[, 2],
+    all_features_new_arranged[, 1] - all_features_new_arranged[, 2]
+  )
+  return(tibble::as_tibble(corrected))
+}
+
 #' Adjust retention time across spectra.
 #'
 #' This function adjusts the retention time in each LC/MS profile to achieve better between-profile agreement.
@@ -135,8 +195,6 @@ correct_time <- function(this.feature, template_features, mz_tol_relative, rt_to
 #'  column in each of the matrices is changed to new adjusted values.
 #' @export
 adjust.time <- function(extracted_features,
-                        mz_tol_relative,
-                        rt_tol_relative,
                         colors = NA,
                         do.plot = TRUE) {
   number_of_samples <- length(extracted_features)
@@ -154,17 +212,14 @@ adjust.time <- function(extracted_features,
 
   corrected_features <- foreach::foreach(features = extracted_features) %do% correct_time(
     features,
-    template_features,
-    mz_tol_relative,
-    rt_tol_relative
+    template_features
   )
 
   if (do.plot) {
     draw_rt_correction_plot(
       colors,
       extracted_features,
       corrected_features,
-      rt_tol_relative
     )
   }
 

diff --git a/R/hybrid.R b/R/hybrid.R
@@ -392,9 +392,7 @@ hybrid <- function(
   message("**** time correction ****")
   corrected <- foreach::foreach(this.feature = extracted_clusters$feature_tables) %dopar% correct_time(
     this.feature,
-    template_features,
-    extracted_clusters$mz_tol_relative,
-    extracted_clusters$rt_tol_relative
+    template_features
   )
 
   message("**** computing clusters ****")
@@ -472,9 +470,7 @@ hybrid <- function(
   message("**** second time correction ****")
   corrected <- foreach::foreach(this.feature = recovered_clusters$feature_tables) %dopar% correct_time(
     this.feature,
-    template_features,
-    recovered_clusters$mz_tol_relative,
-    recovered_clusters$rt_tol_relative
+    template_features
   )
 
   message("**** fourth computing clusters ****")

diff --git a/R/unsupervised.R b/R/unsupervised.R
@@ -182,9 +182,7 @@ unsupervised <- function(
   message("**** time correction ****")
   corrected <- foreach::foreach(this.feature = extracted_clusters$feature_tables) %dopar% correct_time(
     this.feature,
-    template_features,
-    extracted_clusters$mz_tol_relative,
-    extracted_clusters$rt_tol_relative
+    template_features
   )
 
   message("**** computing clusters ****")

diff --git a/R/utils.R b/R/utils.R
@@ -157,3 +157,13 @@ get_num_workers <- function() {
     }
     return(num_workers)
 }
+
+read_parquet_files <- function(filename, folder, pattern) {
+  testdata <- file.path("..", "testdata")
+
+  input <- lapply(filename, function(x) {
+    tibble::as_tibble(arrow::read_parquet(file.path(testdata, folder, paste0(x, pattern))))
+  })
+
+  return(input)
+}
diff --git a/tests/testthat/test-adjust-time.R b/tests/testthat/test-adjust-time.R
@@ -3,11 +3,7 @@ patrick::with_parameters_test_that(
   {
     testdata <- file.path("..", "testdata")
 
-    filenames <- lapply(files, function(x) {
-      file.path(testdata, "clusters", paste0(x, "_extracted_clusters.parquet"))
-    })
-
-    extracted <- lapply(filenames, arrow::read_parquet)
+    extracted <- read_parquet_files(files, "clusters", "_extracted_clusters.parquet")
     template_features <- compute_template(extracted)
 
     expected <- file.path(testdata, "template", "RCX_shortened.parquet")
@@ -30,31 +26,35 @@ patrick::with_parameters_test_that(
     template_features <- file.path(testdata, "template", "RCX_shortened.parquet")
     template_features <- arrow::read_parquet(template_features)
 
-    extracted <- file.path(testdata, "clusters", paste0(.test_name, "_extracted_clusters.parquet"))
-    extracted <- arrow::read_parquet(extracted)
+    extracted <- read_parquet_files(files, "clusters", "_extracted_clusters.parquet")
 
-    corrected <- correct_time(
-      this.feature = extracted,
-      template_features = template_features,
-      mz_tol_relative = mz_tol_relative,
-      rt_tol_relative = rt_tol_relative
-    )
+    corrected <- lapply(extracted, function(x){
+      correct_time(x, template_features)
+    })
 
-    expected <- tibble::as_tibble(arrow::read_parquet(file.path(testdata, "adjusted", paste0(.test_name, ".parquet"))))
-    expect_equal(corrected, expected)
+    expected <- read_parquet_files(files, "adjusted", ".parquet")
+    expect_equal(corrected, expected, tolerance = 0.01)
   },
   patrick::cases(
-    RCX_06_shortened = list(
-      mz_tol_relative = 6.856763e-06,
-      rt_tol_relative = 3.618581
-    ),
-    RCX_07_shortened = list(
-      mz_tol_relative = 6.856763e-06,
-      rt_tol_relative = 3.618581
-    ),
-    RCX_08_shortened = list(
-      mz_tol_relative = 6.856763e-06,
-      rt_tol_relative = 3.618581
+    RCX_shortened = list(
+      files = c("RCX_06_shortened", "RCX_07_shortened", "RCX_08_shortened")
     )
   )
 )
+
+test_that("correct_time_v2 is close to correct time", {
+  template_features <- arrow::read_parquet(
+    file.path("..", "testdata", "template", "RCX_shortened.parquet")
+  )
+
+  clustered_table <- read_parquet_files(
+    c("RCX_06_shortened"),
+    "clusters",
+    "_extracted_clusters.parquet"
+  )[[1]]
+
+  expected <- correct_time(clustered_table, template_features)
+  actual <- correct_time_v2(clustered_table, template_features)
+
+  expect_equal(actual, expected, tolerance = 0.02)
+})
diff --git a/tests/testthat/test-compute_clusters.R b/tests/testthat/test-compute_clusters.R
@@ -3,10 +3,7 @@ patrick::with_parameters_test_that(
   {
     testdata <- file.path("..", "testdata")
 
-    extracted <- lapply(files, function(x) {
-      xx <- file.path(testdata, input, paste0(x, ".parquet"))
-      tibble::as_tibble(arrow::read_parquet(xx))
-    })
+    extracted <- read_parquet_files(files, input, ".parquet")
 
     actual <- compute_clusters(
       feature_tables = extracted,
@@ -18,10 +15,7 @@ patrick::with_parameters_test_that(
       sample_names = files
     )
 
-    expected <- lapply(files, function(x) {
-      filepath <- file.path(testdata, "clusters", paste0(x, "_", input, "_clusters.parquet"))
-      tibble::as_tibble(arrow::read_parquet(filepath))
-    })
+    expected <- read_parquet_files(files, "clusters", paste0("_", input, "_clusters.parquet"))
 
 
     for(i in seq_along(files)) {
@@ -56,10 +50,8 @@ test_that("compute clusters simple", {
   testdata <- file.path("..", "testdata")
   files <- c("8_qc_no_dil_milliq", "21_qc_no_dil_milliq", "29_qc_no_dil_milliq")
 
-  extracted <- lapply(files, function(x) {
-    xx <- file.path(testdata, "extracted", paste0(x, ".mzml.parquet"))
-    tibble::as_tibble(arrow::read_parquet(xx))
-  })
+  extracted <- read_parquet_files(files, "extracted", ".mzml.parquet")
+
   actual <- compute_clusters_simple(
     feature_tables = extracted,
     sample_names = files,
@@ -69,10 +61,7 @@ test_that("compute clusters simple", {
 
   actual <- actual[order(sapply(actual, function(x) x$sample_id[1]))]
 
-  expected <- lapply(files, function(x) {
-    file <- file.path(testdata, "clusters", paste0(x, ".parquet"))
-    tibble::as_tibble(arrow::read_parquet(file))
-  })
+  expected <- read_parquet_files(files, "clusters", ".parquet")
 
   expected <- expected[order(sapply(expected, function(x) x$sample_id[1]))]