Skip to content

Commit

Permalink
Merge branch 'master' into alignment
Browse files Browse the repository at this point in the history
  • Loading branch information
hechth authored Jul 23, 2024
2 parents ee1d2dc + 4cadd1d commit 47000dd
Show file tree
Hide file tree
Showing 9 changed files with 152 additions and 139 deletions.
143 changes: 99 additions & 44 deletions R/adjust.time.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,16 @@ NULL
compute_comb <- function(template_features, features) {
combined <- dplyr::bind_rows(
template_features,
dplyr::bind_cols(features |> dplyr::select(c(mz, rt)), sample_id = features$sample_id)
)
combined <- combined |> dplyr::arrange_at("mz")
features |> dplyr::select(c(mz, rt, cluster, sample_id))
) |> dplyr::arrange_at(c("cluster","mz"))
return(combined)
}

#' @export
compute_sel <- function(combined, mz_tol_relative, rt_tol_relative) {
compute_sel <- function(combined) {
l <- nrow(combined)
sel <- which(combined$mz[2:l] - combined$mz[1:(l - 1)] <
mz_tol_relative * combined$mz[1:(l - 1)] * 2 &
abs(combined$rt[2:l] - combined$rt[1:(l - 1)]) <
rt_tol_relative & combined$sample_id[2:l] != combined$sample_id[1:(l - 1)])
sel <- which(combined$cluster[1:(l - 1)] == combined$cluster[2:l] &
combined$sample_id[1:(l - 1)] != combined$sample_id[2:l])
return(sel)
}

Expand All @@ -32,33 +29,65 @@ compute_template_adjusted_rt <- function(combined, sel, j) {

# now the first column is the template retention time.
# the second column is the to-be-adjusted retention time

all_features <- all_features[order(all_features[, 2]), ]
return(all_features)
}

compute_corrected_features_v2 <- function(features, template_rt, delta_rt) {
features <- features |> dplyr::arrange_at(c("rt", "mz"))
idx <- dplyr::between(features$rt, min(template_rt), max(template_rt))
to_correct <- (features |> dplyr::filter(idx))$rt

this.smooth <- ksmooth(
template_rt,
delta_rt,
kernel = "normal",
bandwidth = (max(delta_rt) - min(delta_rt)) / 5,
x.points = to_correct
)

lower_bound_adjustment <- mean(this.smooth$y[this.smooth$x == min(this.smooth$x)])
upper_bound_adjustment <- mean(this.smooth$y[this.smooth$x == max(this.smooth$x)])

features <- features |>
dplyr::mutate(rt = dplyr::case_when(
rt < min(template_rt) ~ rt + lower_bound_adjustment,
rt > max(template_rt) ~ rt + upper_bound_adjustment
))
features[idx, "rt"] <- to_correct + this.smooth$y
return(features |> dplyr::arrange_at(c("mz", "rt")))
}

#' @export
compute_corrected_features <- function(features, delta_rt, avg_time) {
features <- features[order(features$rt, features$mz), ]
features <- features |> dplyr::arrange_at(c("rt", "mz"))

corrected <- features$rt
original <- features$rt
to_correct <- original[original >= min(delta_rt) &
original <= max(delta_rt)]

this.smooth <- ksmooth(delta_rt, avg_time,
idx <- dplyr::between(original, min(delta_rt), max(delta_rt))
to_correct <- original[idx]
this.smooth <- ksmooth(
delta_rt,
avg_time,
kernel = "normal",
bandwidth = (max(delta_rt) - min(delta_rt)) / 5,
x.points = to_correct
)

corrected[dplyr::between(original, min(delta_rt), max(delta_rt))] <-
this.smooth$y + to_correct
corrected[original < min(delta_rt)] <- corrected[original < min(delta_rt)] +
mean(this.smooth$y[this.smooth$x == min(this.smooth$x)])
corrected[original > max(delta_rt)] <- corrected[original > max(delta_rt)] +
mean(this.smooth$y[this.smooth$x == max(this.smooth$x)])
corrected[idx] <- this.smooth$y + to_correct
lower_bound_adjustment <- mean(this.smooth$y[this.smooth$x == min(this.smooth$x)])
upper_bound_adjustment <- mean(this.smooth$y[this.smooth$x == max(this.smooth$x)])

idx_lower <- original < min(delta_rt)
idx_upper <- original > max(delta_rt)

corrected[idx_lower] <- corrected[idx_lower] + lower_bound_adjustment
corrected[idx_upper] <- corrected[idx_upper] + upper_bound_adjustment
features$rt <- corrected
features <- features[order(features$mz, features$rt), ]

return(features)
}

Expand All @@ -76,36 +105,25 @@ fill_missing_values <- function(orig.feature, this.feature) {
}

#' @export
compute_template <- function(extracted_features) {
num.ftrs <- sapply(extracted_features, nrow)
template_id <- which.max(num.ftrs)
template <- extracted_features[[template_id]]$sample_id[1]
message(paste("the template is sample", template))

candi <- tibble::as_tibble(extracted_features[[template_id]]) |> dplyr::select(c(mz, rt))
template_features <- dplyr::bind_cols(candi, sample_id = rep(template, nrow(candi)))
return(tibble::as_tibble(template_features))
}

#' @export
correct_time <- function(this.feature, template_features, mz_tol_relative, rt_tol_relative) {
correct_time <- function(this.feature, template_features) {
orig.features <- this.feature
template <- unique(template_features$sample_id)[1]
j <- unique(this.feature$sample_id)[1]

if (j != template) {
this.comb <- compute_comb(template_features, this.feature)
sel <- compute_sel(this.comb, mz_tol_relative, rt_tol_relative)
sel <- compute_sel(this.comb)

if (length(sel) < 20) {
stop("too few, aborted")
} else {
all.ftr.table <- compute_template_adjusted_rt(this.comb, sel, j)
# the to be adjusted time
this.diff <- all.ftr.table[, 2]
# the difference between the true time and the to-be-adjusted time
avg_time <- all.ftr.table[, 1] - this.diff
this.feature <- compute_corrected_features(this.feature, this.diff, avg_time)

this.feature <- compute_corrected_features(
this.feature,
all.ftr.table[, 2], # the to be adjusted time
all.ftr.table[, 1] - all.ftr.table[, 2] # the difference between the true time and the to-be-adjusted time
)
}
}

Expand All @@ -119,6 +137,48 @@ correct_time <- function(this.feature, template_features, mz_tol_relative, rt_to
return(tibble::as_tibble(this.feature, column_name = c("mz", "rt", "sd1", "sd2", "area", "sample_id", "cluster")))
}

#' @export
compute_template <- function(extracted_features) {
num.ftrs <- sapply(extracted_features, nrow)
template_id <- which.max(num.ftrs)
template <- extracted_features[[template_id]]$sample_id[1]
message(paste("the template is sample", template))

candi <- tibble::as_tibble(extracted_features[[template_id]]) |> dplyr::select(c(mz, rt, cluster))
template_features <- dplyr::bind_cols(candi, sample_id = rep(template, nrow(candi)))
return(tibble::as_tibble(template_features))
}

correct_time_v2 <- function(features, template) {
if (unique(features$sample_id) == unique(template$sample_id))
return(tibble::as_tibble(features))

subsets <- template |>
dplyr::bind_rows(
features |> dplyr::select(c(mz, rt, cluster, sample_id))
) |>
dplyr::arrange_at(c("cluster", "mz")) |>
dplyr::group_by(cluster) |>
dplyr::mutate(count = dplyr::n_distinct(sample_id)) |>
filter(count == 2) |>
dplyr::add_count() |>
filter(n == 2) |>
dplyr::ungroup() |>
dplyr::group_by(sample_id) |>
dplyr::group_split()

all_features_new <- cbind(subsets[[1]]$rt, subsets[[2]]$rt)
all_features_new_order <- order(all_features_new[, 2])
all_features_new_arranged <- all_features_new[all_features_new_order,]

corrected <- compute_corrected_features_v2(
features,
all_features_new_arranged[, 2],
all_features_new_arranged[, 1] - all_features_new_arranged[, 2]
)
return(tibble::as_tibble(corrected))
}

#' Adjust retention time across spectra.
#'
#' This function adjusts the retention time in each LC/MS profile to achieve better between-profile agreement.
Expand All @@ -135,8 +195,6 @@ correct_time <- function(this.feature, template_features, mz_tol_relative, rt_to
#' column in each of the matrices is changed to new adjusted values.
#' @export
adjust.time <- function(extracted_features,
mz_tol_relative,
rt_tol_relative,
colors = NA,
do.plot = TRUE) {
number_of_samples <- length(extracted_features)
Expand All @@ -154,17 +212,14 @@ adjust.time <- function(extracted_features,

corrected_features <- foreach::foreach(features = extracted_features) %do% correct_time(
features,
template_features,
mz_tol_relative,
rt_tol_relative
template_features
)

if (do.plot) {
draw_rt_correction_plot(
colors,
extracted_features,
corrected_features,
rt_tol_relative
)
}

Expand Down
8 changes: 2 additions & 6 deletions R/hybrid.R
Original file line number Diff line number Diff line change
Expand Up @@ -392,9 +392,7 @@ hybrid <- function(
message("**** time correction ****")
corrected <- foreach::foreach(this.feature = extracted_clusters$feature_tables) %dopar% correct_time(
this.feature,
template_features,
extracted_clusters$mz_tol_relative,
extracted_clusters$rt_tol_relative
template_features
)

message("**** computing clusters ****")
Expand Down Expand Up @@ -472,9 +470,7 @@ hybrid <- function(
message("**** second time correction ****")
corrected <- foreach::foreach(this.feature = recovered_clusters$feature_tables) %dopar% correct_time(
this.feature,
template_features,
recovered_clusters$mz_tol_relative,
recovered_clusters$rt_tol_relative
template_features
)

message("**** fourth computing clusters ****")
Expand Down
4 changes: 1 addition & 3 deletions R/unsupervised.R
Original file line number Diff line number Diff line change
Expand Up @@ -182,9 +182,7 @@ unsupervised <- function(
message("**** time correction ****")
corrected <- foreach::foreach(this.feature = extracted_clusters$feature_tables) %dopar% correct_time(
this.feature,
template_features,
extracted_clusters$mz_tol_relative,
extracted_clusters$rt_tol_relative
template_features
)

message("**** computing clusters ****")
Expand Down
10 changes: 10 additions & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -157,3 +157,13 @@ get_num_workers <- function() {
}
return(num_workers)
}

read_parquet_files <- function(filename, folder, pattern) {
testdata <- file.path("..", "testdata")

input <- lapply(filename, function(x) {
tibble::as_tibble(arrow::read_parquet(file.path(testdata, folder, paste0(x, pattern))))
})

return(input)
}
52 changes: 26 additions & 26 deletions tests/testthat/test-adjust-time.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,7 @@ patrick::with_parameters_test_that(
{
testdata <- file.path("..", "testdata")

filenames <- lapply(files, function(x) {
file.path(testdata, "clusters", paste0(x, "_extracted_clusters.parquet"))
})

extracted <- lapply(filenames, arrow::read_parquet)
extracted <- read_parquet_files(files, "clusters", "_extracted_clusters.parquet")
template_features <- compute_template(extracted)

expected <- file.path(testdata, "template", "RCX_shortened.parquet")
Expand All @@ -30,31 +26,35 @@ patrick::with_parameters_test_that(
template_features <- file.path(testdata, "template", "RCX_shortened.parquet")
template_features <- arrow::read_parquet(template_features)

extracted <- file.path(testdata, "clusters", paste0(.test_name, "_extracted_clusters.parquet"))
extracted <- arrow::read_parquet(extracted)
extracted <- read_parquet_files(files, "clusters", "_extracted_clusters.parquet")

corrected <- correct_time(
this.feature = extracted,
template_features = template_features,
mz_tol_relative = mz_tol_relative,
rt_tol_relative = rt_tol_relative
)
corrected <- lapply(extracted, function(x){
correct_time(x, template_features)
})

expected <- tibble::as_tibble(arrow::read_parquet(file.path(testdata, "adjusted", paste0(.test_name, ".parquet"))))
expect_equal(corrected, expected)
expected <- read_parquet_files(files, "adjusted", ".parquet")
expect_equal(corrected, expected, tolerance = 0.01)
},
patrick::cases(
RCX_06_shortened = list(
mz_tol_relative = 6.856763e-06,
rt_tol_relative = 3.618581
),
RCX_07_shortened = list(
mz_tol_relative = 6.856763e-06,
rt_tol_relative = 3.618581
),
RCX_08_shortened = list(
mz_tol_relative = 6.856763e-06,
rt_tol_relative = 3.618581
RCX_shortened = list(
files = c("RCX_06_shortened", "RCX_07_shortened", "RCX_08_shortened")
)
)
)

test_that("correct_time_v2 is close to correct time", {
template_features <- arrow::read_parquet(
file.path("..", "testdata", "template", "RCX_shortened.parquet")
)

clustered_table <- read_parquet_files(
c("RCX_06_shortened"),
"clusters",
"_extracted_clusters.parquet"
)[[1]]

expected <- correct_time(clustered_table, template_features)
actual <- correct_time_v2(clustered_table, template_features)

expect_equal(actual, expected, tolerance = 0.02)
})
21 changes: 5 additions & 16 deletions tests/testthat/test-compute_clusters.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,7 @@ patrick::with_parameters_test_that(
{
testdata <- file.path("..", "testdata")

extracted <- lapply(files, function(x) {
xx <- file.path(testdata, input, paste0(x, ".parquet"))
tibble::as_tibble(arrow::read_parquet(xx))
})
extracted <- read_parquet_files(files, input, ".parquet")

actual <- compute_clusters(
feature_tables = extracted,
Expand All @@ -18,10 +15,7 @@ patrick::with_parameters_test_that(
sample_names = files
)

expected <- lapply(files, function(x) {
filepath <- file.path(testdata, "clusters", paste0(x, "_", input, "_clusters.parquet"))
tibble::as_tibble(arrow::read_parquet(filepath))
})
expected <- read_parquet_files(files, "clusters", paste0("_", input, "_clusters.parquet"))


for(i in seq_along(files)) {
Expand Down Expand Up @@ -56,10 +50,8 @@ test_that("compute clusters simple", {
testdata <- file.path("..", "testdata")
files <- c("8_qc_no_dil_milliq", "21_qc_no_dil_milliq", "29_qc_no_dil_milliq")

extracted <- lapply(files, function(x) {
xx <- file.path(testdata, "extracted", paste0(x, ".mzml.parquet"))
tibble::as_tibble(arrow::read_parquet(xx))
})
extracted <- read_parquet_files(files, "extracted", ".mzml.parquet")

actual <- compute_clusters_simple(
feature_tables = extracted,
sample_names = files,
Expand All @@ -69,10 +61,7 @@ test_that("compute clusters simple", {

actual <- actual[order(sapply(actual, function(x) x$sample_id[1]))]

expected <- lapply(files, function(x) {
file <- file.path(testdata, "clusters", paste0(x, ".parquet"))
tibble::as_tibble(arrow::read_parquet(file))
})
expected <- read_parquet_files(files, "clusters", ".parquet")

expected <- expected[order(sapply(expected, function(x) x$sample_id[1]))]

Expand Down
Loading

0 comments on commit 47000dd

Please sign in to comment.