From e8d4bebbe5f7e33a3356cfb4da90be5c357a3dd9 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Mon, 29 Apr 2024 09:15:42 +0200 Subject: [PATCH] update pse --- R/common.R | 164 +++++++++++++++++++++++++---------------------------- 1 file changed, 76 insertions(+), 88 deletions(-) diff --git a/R/common.R b/R/common.R index ff18ae8..15b246b 100644 --- a/R/common.R +++ b/R/common.R @@ -133,10 +133,10 @@ NRC <- function(a, b) { return(1-res) } -## modified from QUILT calculate_pse_2ways <- function(test, truth, which_snps, + i_option = 0, seed = NULL) { ## for testing if (!is.null(seed)) set.seed(seed) @@ -156,87 +156,78 @@ calculate_pse_2ways <- function(test, ## as these sites, discrepency as well disc <- sum(rowSums(test) != 1) ## round test data for now. choose hets at random - ## for homs - testO <- test - truthO <- truth - for (i_option in 1:2) { - test <- testO - truth <- truthO - ## specifically remove from consideration double phase switch errors - w <- rowSums(test) == 1 - w2 <- which(diff(abs(test[w, 1] - truth[w, 1])) != 0) - to_remove <- NULL - if (length(w2) > 0) { - w3 <- which(diff(w2) == 1) - if (length(w3) > 0) { - for (a in w3) { - c <- w2[c(a, a + 1)] - to_remove <- c(to_remove, which(w)[c]) - } + ## specifically remove from consideration double phase switch errors + w <- rowSums(test) == 1 + w2 <- which(diff(abs(test[w, 1] - truth[w, 1])) != 0) + to_remove <- NULL + if (length(w2) > 0) { + w3 <- which(diff(w2) == 1) + if (length(w3) > 0) { + for (a in w3) { + c <- w2[c(a, a + 1)] + to_remove <- c(to_remove, which(w)[c]) } } - ## double pse are two consecutive - if (length(to_remove) > 0) { - test <- test[-to_remove, ] - truth <- truth[-to_remove, ] - snps <- snps[-to_remove] - } - ## - if (i_option == 1) { - ## only consider non-discrepent sites - ## chose best start - if (test[1, 1] != truth[1, 1]) { - test <- test[, c(2, 1)] - } - ## calculate number of differences - w <- rowSums(test) == 1 - if (sum(w) == 0) { - warning("Test has no hets! possibly an error or homo over region") - return(NA) - ## switches1 <- cbind(i1 = NA, i2 = NA, l1 = NA, l2 = NA) - phase_errors_def1 <- 0 - phase_sites_def1 <- 0 - } else { - y <- diff(abs(test[w, 1] - truth[w, 1])) != 0 - snps <- snps[y] - phase_errors_def1 <- sum(y) - phase_sites_def1 <- sum(w) - 1 - ## we need rownames(test) <- 1:nrow(test) beforehand - ## s <- as.integer(rownames(test[w, , drop = FALSE][c(as.logical(y), FALSE), , drop = FALSE])) - ## e <- as.integer(rownames(test[w, , drop = FALSE][c(FALSE, as.logical(y)), , drop = FALSE])) - ## switches1 <- cbind(i1 = s, i2 = e) - } + } + ## double pse are two consecutive + if (length(to_remove) > 0) { + test <- test[-to_remove, ] + truth <- truth[-to_remove, ] + snps <- snps[-to_remove] + } + ## + if (i_option == 0) { + ## only consider non-discrepent sites + ## chose best start + if (test[1, 1] != truth[1, 1]) { + test <- test[, c(2, 1)] } - if (i_option == 2) { - choose_at_random <- which(rowSums(test) != 1) - if (length(choose_at_random) > 0) { - test[choose_at_random, ] <- 0 - r <- sample( - c(1, 2), - length(choose_at_random), - replace = TRUE - ) - test[cbind(choose_at_random, r)] <- 1 - } - ## chose best start - if (test[1, 1] != truth[1, 1]) { - test <- test[, c(2, 1)] - } - ## calculate number of differences - y <- diff(abs(test[, 1] - truth[, 1])) != 0 + ## calculate number of differences + w <- rowSums(test) == 1 + if (sum(w) == 0) { + warning("Test has no hets! possibly an error or homo over region") + return(NA) + ## switches1 <- cbind(i1 = NA, i2 = NA, l1 = NA, l2 = NA) + phase_errors <- 0 + phase_sites <- 0 + } else { + y <- diff(abs(test[w, 1] - truth[w, 1])) != 0 snps <- snps[y] - phase_errors_def2 <- sum(y) - phase_sites_def2 <- nrow(test) - 1 + phase_errors <- sum(y) + phase_sites <- sum(w) - 1 + ## we need rownames(test) <- 1:nrow(test) beforehand + ## s <- as.integer(rownames(test[w, , drop = FALSE][c(as.logical(y), FALSE), , drop = FALSE])) + ## e <- as.integer(rownames(test[w, , drop = FALSE][c(FALSE, as.logical(y)), , drop = FALSE])) + ## switches1 <- cbind(i1 = s, i2 = e) + } + } + if (i_option == 1) { + choose_at_random <- which(rowSums(test) != 1) + if (length(choose_at_random) > 0) { + test[choose_at_random, ] <- 0 + r <- sample( + c(1, 2), + length(choose_at_random), + replace = TRUE + ) + test[cbind(choose_at_random, r)] <- 1 } + ## chose best start + if (test[1, 1] != truth[1, 1]) { + test <- test[, c(2, 1)] + } + ## calculate number of differences + y <- diff(abs(test[, 1] - truth[, 1])) != 0 + snps <- snps[y] + phase_errors <- sum(y) + phase_sites <- nrow(test) - 1 } ## return( list( values = c( - phase_errors_def1 = phase_errors_def1, - phase_sites_def1 = phase_sites_def1, - phase_errors_def2 = phase_errors_def2, - phase_sites_def2 = phase_sites_def2, + phase_errors = phase_errors, + phase_sites = phase_sites, disc_errors = disc, dist_n = nrow(test) ), @@ -245,31 +236,28 @@ calculate_pse_2ways <- function(test, ) } -PSE <- function(a, b, +PSE <- function(test, truth, sites = NULL, choose_random_start = FALSE, return_pse_sites = TRUE) { - stopifnot(is.matrix(a) & is.matrix(b)) - stopifnot(ncol(a) %% 2 == 0) - N <- ncol(a) / 2 - if(is.null(sites)) sites <- 1:nrow(a) + stopifnot(is.matrix(test) & is.matrix(truth)) + stopifnot(ncol(test) %% 2 == 0) + N <- ncol(test) / 2 + if(is.null(sites)) sites <- 1:nrow(test) + pos <- NULL o <- lapply(1:N, function(i) { - truth <-a[,1:2+(i-1)*2] # truth - test <- b[,1:2+(i-1)*2] # test - res <- calculate_pse_2ways(test, truth, which_snps = sites) + itruth <-truth[,1:2+(i-1)*2] # truth + itest <- test[,1:2+(i-1)*2] # test + res <- calculate_pse_2ways(itest, itruth, which_snps = sites, i_option = choose_random_start) if(is.list(res)){ values <- res$values - if(choose_random_start) - pse <- values["phase_errors_def2"] / values["phase_sites_def2"] - else - pse <- values["phase_errors_def1"] / values["phase_sites_def1"] - pse <- round(pse, 3) - pos <- ifelse(return_pse_sites, res$sites, NA) - list(pse=pse, pos=pos) + pse <- values["phase_errors"] / values["phase_sites"] + disc <- values["disc_errors"] / values["dist_n"] + if(return_pse_sites) pos <- res$sites + list(pse=pse, disc=disc, pos=pos) } else { NA } }) return(o) } -