Skip to content

Commit

Permalink
update pse
Browse files Browse the repository at this point in the history
  • Loading branch information
Zilong-Li committed Apr 29, 2024
1 parent 11f26fd commit e8d4beb
Showing 1 changed file with 76 additions and 88 deletions.
164 changes: 76 additions & 88 deletions R/common.R
Original file line number Diff line number Diff line change
Expand Up @@ -133,10 +133,10 @@ NRC <- function(a, b) {
return(1-res)
}

## modified from QUILT
calculate_pse_2ways <- function(test,
truth,
which_snps,
i_option = 0,
seed = NULL) {
## for testing
if (!is.null(seed)) set.seed(seed)
Expand All @@ -156,87 +156,78 @@ calculate_pse_2ways <- function(test,
## as these sites, discrepency as well
disc <- sum(rowSums(test) != 1)
## round test data for now. choose hets at random
## for homs
testO <- test
truthO <- truth
for (i_option in 1:2) {
test <- testO
truth <- truthO
## specifically remove from consideration double phase switch errors
w <- rowSums(test) == 1
w2 <- which(diff(abs(test[w, 1] - truth[w, 1])) != 0)
to_remove <- NULL
if (length(w2) > 0) {
w3 <- which(diff(w2) == 1)
if (length(w3) > 0) {
for (a in w3) {
c <- w2[c(a, a + 1)]
to_remove <- c(to_remove, which(w)[c])
}
## specifically remove from consideration double phase switch errors
w <- rowSums(test) == 1
w2 <- which(diff(abs(test[w, 1] - truth[w, 1])) != 0)
to_remove <- NULL
if (length(w2) > 0) {
w3 <- which(diff(w2) == 1)
if (length(w3) > 0) {
for (a in w3) {
c <- w2[c(a, a + 1)]
to_remove <- c(to_remove, which(w)[c])
}
}
## double pse are two consecutive
if (length(to_remove) > 0) {
test <- test[-to_remove, ]
truth <- truth[-to_remove, ]
snps <- snps[-to_remove]
}
##
if (i_option == 1) {
## only consider non-discrepent sites
## chose best start
if (test[1, 1] != truth[1, 1]) {
test <- test[, c(2, 1)]
}
## calculate number of differences
w <- rowSums(test) == 1
if (sum(w) == 0) {
warning("Test has no hets! possibly an error or homo over region")
return(NA)
## switches1 <- cbind(i1 = NA, i2 = NA, l1 = NA, l2 = NA)
phase_errors_def1 <- 0
phase_sites_def1 <- 0
} else {
y <- diff(abs(test[w, 1] - truth[w, 1])) != 0
snps <- snps[y]
phase_errors_def1 <- sum(y)
phase_sites_def1 <- sum(w) - 1
## we need rownames(test) <- 1:nrow(test) beforehand
## s <- as.integer(rownames(test[w, , drop = FALSE][c(as.logical(y), FALSE), , drop = FALSE]))
## e <- as.integer(rownames(test[w, , drop = FALSE][c(FALSE, as.logical(y)), , drop = FALSE]))
## switches1 <- cbind(i1 = s, i2 = e)
}
}
## double pse are two consecutive
if (length(to_remove) > 0) {
test <- test[-to_remove, ]
truth <- truth[-to_remove, ]
snps <- snps[-to_remove]
}
##
if (i_option == 0) {
## only consider non-discrepent sites
## chose best start
if (test[1, 1] != truth[1, 1]) {
test <- test[, c(2, 1)]
}
if (i_option == 2) {
choose_at_random <- which(rowSums(test) != 1)
if (length(choose_at_random) > 0) {
test[choose_at_random, ] <- 0
r <- sample(
c(1, 2),
length(choose_at_random),
replace = TRUE
)
test[cbind(choose_at_random, r)] <- 1
}
## chose best start
if (test[1, 1] != truth[1, 1]) {
test <- test[, c(2, 1)]
}
## calculate number of differences
y <- diff(abs(test[, 1] - truth[, 1])) != 0
## calculate number of differences
w <- rowSums(test) == 1
if (sum(w) == 0) {
warning("Test has no hets! possibly an error or homo over region")
return(NA)
## switches1 <- cbind(i1 = NA, i2 = NA, l1 = NA, l2 = NA)
phase_errors <- 0
phase_sites <- 0
} else {
y <- diff(abs(test[w, 1] - truth[w, 1])) != 0
snps <- snps[y]
phase_errors_def2 <- sum(y)
phase_sites_def2 <- nrow(test) - 1
phase_errors <- sum(y)
phase_sites <- sum(w) - 1
## we need rownames(test) <- 1:nrow(test) beforehand
## s <- as.integer(rownames(test[w, , drop = FALSE][c(as.logical(y), FALSE), , drop = FALSE]))
## e <- as.integer(rownames(test[w, , drop = FALSE][c(FALSE, as.logical(y)), , drop = FALSE]))
## switches1 <- cbind(i1 = s, i2 = e)
}
}
if (i_option == 1) {
choose_at_random <- which(rowSums(test) != 1)
if (length(choose_at_random) > 0) {
test[choose_at_random, ] <- 0
r <- sample(
c(1, 2),
length(choose_at_random),
replace = TRUE
)
test[cbind(choose_at_random, r)] <- 1
}
## chose best start
if (test[1, 1] != truth[1, 1]) {
test <- test[, c(2, 1)]
}
## calculate number of differences
y <- diff(abs(test[, 1] - truth[, 1])) != 0
snps <- snps[y]
phase_errors <- sum(y)
phase_sites <- nrow(test) - 1
}
##
return(
list(
values = c(
phase_errors_def1 = phase_errors_def1,
phase_sites_def1 = phase_sites_def1,
phase_errors_def2 = phase_errors_def2,
phase_sites_def2 = phase_sites_def2,
phase_errors = phase_errors,
phase_sites = phase_sites,
disc_errors = disc,
dist_n = nrow(test)
),
Expand All @@ -245,31 +236,28 @@ calculate_pse_2ways <- function(test,
)
}

PSE <- function(a, b,
PSE <- function(test, truth,
sites = NULL,
choose_random_start = FALSE,
return_pse_sites = TRUE) {
stopifnot(is.matrix(a) & is.matrix(b))
stopifnot(ncol(a) %% 2 == 0)
N <- ncol(a) / 2
if(is.null(sites)) sites <- 1:nrow(a)
stopifnot(is.matrix(test) & is.matrix(truth))
stopifnot(ncol(test) %% 2 == 0)
N <- ncol(test) / 2
if(is.null(sites)) sites <- 1:nrow(test)
pos <- NULL
o <- lapply(1:N, function(i) {
truth <-a[,1:2+(i-1)*2] # truth
test <- b[,1:2+(i-1)*2] # test
res <- calculate_pse_2ways(test, truth, which_snps = sites)
itruth <-truth[,1:2+(i-1)*2] # truth
itest <- test[,1:2+(i-1)*2] # test
res <- calculate_pse_2ways(itest, itruth, which_snps = sites, i_option = choose_random_start)
if(is.list(res)){
values <- res$values
if(choose_random_start)
pse <- values["phase_errors_def2"] / values["phase_sites_def2"]
else
pse <- values["phase_errors_def1"] / values["phase_sites_def1"]
pse <- round(pse, 3)
pos <- ifelse(return_pse_sites, res$sites, NA)
list(pse=pse, pos=pos)
pse <- values["phase_errors"] / values["phase_sites"]
disc <- values["disc_errors"] / values["dist_n"]
if(return_pse_sites) pos <- res$sites
list(pse=pse, disc=disc, pos=pos)
} else {
NA
}
})
return(o)
}

0 comments on commit e8d4beb

Please sign in to comment.