Skip to content

Commit

Permalink
Merge pull request #430 from Qile0317/429
Browse files Browse the repository at this point in the history
Fix TRUST4 loadContigs issue in #429
  • Loading branch information
ncborcherding authored Oct 28, 2024
2 parents 18a2d41 + c18c1d0 commit 4845b7c
Show file tree
Hide file tree
Showing 11 changed files with 214 additions and 185 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/R-CMD-check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ on:
push:
branches: [main, master, v2]
pull_request:
branches: [main, master, v2]
branches: [main, master, v2, dev]

name: R-CMD-check

Expand Down
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,6 @@ local_tests.R
docs
vignettes/articles/scRep_example_full.rds
.vscode
qile
qile
dev
.lintr
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
* Removed unnecessary code remnant in ```clonalLength()```
* Allow one sample to be plotted by ```percentVJ()```
* Fixed issue with ```positionalProperty()``` and exportTable
* Fixed issue with ```loadContigs()``` edge case when TRUST4 data only has 1 row.

# scRepertoire VERSION 2.0.7

Expand Down
234 changes: 129 additions & 105 deletions R/loadContigs.R
Original file line number Diff line number Diff line change
@@ -1,124 +1,156 @@
#' Loading the contigs derived from single-cell sequencing
#'
#' This function generates a contig list and formats the data to allow for
#' function with \code{\link{combineTCR}} or \code{\link{combineBCR}}. If
#' using data derived from filtered outputs of 10X Genomics, there is no
#' need to use this function as the data is already compatible.
#'
#' When \code{input} is a directory, it is searched recursively for files
#' whose names end with the suffix associated with \code{format}:
#' \itemize{
#'   \item 10X =  "filtered_contig_annotations.csv"
#'   \item AIRR = "airr_rearrangement.tsv"
#'   \item BD = "Contigs_AIRR.tsv"
#'   \item Dandelion = "all_contig_dandelion.tsv"
#'   \item Immcantation = "_data.tsv"
#'   \item JSON = ".json"
#'   \item ParseBio = "barcode_report.tsv"
#'   \item MiXCR = "clones.tsv"
#'   \item Omniscope = "_OSB.csv" or "_OST.csv"
#'   \item TRUST4 = "barcode_report.tsv"
#'   \item WAT3R = "barcode_results.csv"
#' }
#'
#' @examples
#' TRUST4 <- read.csv("https://www.borch.dev/uploads/contigs/TRUST4_contigs.csv")
#' contig.list <- loadContigs(TRUST4, format = "TRUST4")
#'
#' BD <- read.csv("https://www.borch.dev/uploads/contigs/BD_contigs.csv")
#' contig.list <- loadContigs(BD, format = "BD")
#'
#' WAT3R <- read.csv("https://www.borch.dev/uploads/contigs/WAT3R_contigs.csv")
#' contig.list <- loadContigs(WAT3R, format = "WAT3R")
#'
#' @param input The directory in which contigs are located or a list with contig
#' elements
#' @param format The format of the single-cell contig, currently supporting:
#' "10X", "AIRR", "BD", "Dandelion", "JSON", "MiXCR", "ParseBio", "Omniscope",
#' "TRUST4", "WAT3R", and "Immcantation"
#' @importFrom utils read.csv read.delim
#' @importFrom rjson fromJSON
#' @export
#' @concept Loading_and_Processing_Contigs
#' @return List of contigs for compatibility with \code{\link{combineTCR}} or
#' \code{\link{combineBCR}}. An empty list (with a warning) is returned when
#' \code{input} is a directory that contains no matching files.
loadContigs <- function(input, format = "10X") {

  assert_that(is.string(input) || is.list(input) || is.data.frame(input))
  assert_that(is.string(format))
  assert_that(format %in% c(
    "10X", "AIRR", "BD", "Dandelion", "JSON", "MiXCR", "ParseBio",
    "Omniscope", "TRUST4", "WAT3R", "Immcantation"
  ))

  rawDataDfList <- if (inherits(x = input, what = "character")) {

    # Loading from directory, recursively
    format.list <- list(
      "WAT3R" = "barcode_results.csv",
      "10X" = "filtered_contig_annotations.csv",
      "AIRR" = "airr_rearrangement.tsv",
      "Dandelion" = "all_contig_dandelion.tsv",
      "Immcantation" = "_data.tsv",
      "MiXCR" = "clones.tsv",
      "JSON" = ".json",
      "TRUST4" = "barcode_report.tsv",
      "BD" = "Contigs_AIRR.tsv",
      "Omniscope" = c("_OSB.csv", "_OST.csv"),
      "ParseBio" = "barcode_report.tsv"
    )
    file.pattern <- format.list[[format]]

    # list.files() takes a regular expression, not a glob: match each suffix
    # literally ("." must not act as a wildcard) and anchor it at the end of
    # the file name.
    suffix.regex <- paste0(
      gsub(".", "[.]", file.pattern, fixed = TRUE), "$",
      collapse = "|"
    )
    contig.files <- list.files(
      input,
      pattern = suffix.regex,
      recursive = TRUE,
      full.names = TRUE
    )

    if (length(contig.files) == 0) {
      warning("No files found in the directory")
      return(list())
    }

    # The comparison must use the same spelling as the validated format
    # ("JSON"); a lower-case test never matches and JSON files would be
    # handed to read.delim().
    reader <- if (format == "JSON") {
      # rjson::fromJSON() interprets its first argument as JSON text, so the
      # path has to be passed through `file =`.
      function(x) as.data.frame(fromJSON(file = x))
    } else if (format %in% c("10X", "WAT3R", "Omniscope")) {
      read.csv
    } else {
      read.delim
    }

    lapply(contig.files, reader)

  } else { # handle an already loaded list of dfs / 1 df
    .checkList(input)
  }

  # Pick the format-specific parser; `format` was validated above, so one of
  # the branches always matches.
  loadFunc <- switch(format,
    "10X" = .parse10x,
    "AIRR" = .parseAIRR,
    "Dandelion" = .parseDandelion,
    "JSON" = .parseJSON,
    "MiXCR" = .parseMiXCR,
    "TRUST4" = .parseTRUST4,
    "BD" = .parseBD,
    "WAT3R" = .parseWAT3R,
    "Omniscope" = .parseOmniscope,
    "Immcantation" = .parseImmcantation,
    "ParseBio" = .parseParse
  )

  loadFunc(rawDataDfList)
}

#Formats TRUST4 data
#
# Takes a list of TRUST4 barcode-report data frames and returns a list of
# long-format data frames with one row per barcode and chain, holding the
# columns barcode, v_gene, d_gene, j_gene, c_gene, cdr3_nt, cdr3, reads and
# (added by .chain.parser) chain.
#' @importFrom stringr str_split
.parseTRUST4 <- function(df) {

  chain.fields <- c(
    "v_gene", "d_gene", "j_gene", "c_gene", "cdr3_nt", "cdr3", "reads"
  )

  # Splits one comma-separated TRUST4 chain column into the seven contig
  # fields and pairs them with the barcode column.
  processChain <- function(data, chain_col) {
    if (all(is.na(data[[chain_col]]))) {
      # No contig information at all: keep one all-NA row per barcode
      chain <- matrix(ncol = 7, nrow = length(data[[chain_col]]))
    } else {
      chain <- str_split(data[[chain_col]], ",", simplify = TRUE)
      # drop = FALSE keeps a single-row result as a 1 x 7 matrix; without it
      # the row collapses to a plain vector and colnames<- fails
      chain <- chain[, seq_len(7), drop = FALSE]
      chain[chain == "*"] <- "None"
    }
    colnames(chain) <- chain.fields
    data.frame(barcode = data$barcode, chain)
  }

  formattedDfs <- lapply(df, function(data) {

    colnames(data)[1] <- "barcode"
    data[data == "*"] <- NA

    # not a mistake, opposite definitions in TRUST4 and scRepertoire
    chain1 <- processChain(data, "chain2")
    chain2 <- processChain(data, "chain1")

    combined_data <- rbind(chain1, chain2)
    combined_data[combined_data == ""] <- NA
    combined_data
  })
  # is it necessary to drop rows that are fully NA with an existing barcode?
  .chain.parser(formattedDfs)
}

# Derives the chain label for every contig table in a list: the first three
# characters of v_gene (e.g. "TRA", "TRB") are stored in a new `chain` column.
# Returns a list of the updated tables, keeping the names of the input list.
.chain.parser <- function(df) {
  addChainColumn <- function(contigs) {
    geneNames <- contigs$v_gene
    contigs$chain <- substr(geneNames, 1, 3)
    contigs
  }
  lapply(df, addChainColumn)
}

#Formats wat3r data
Expand All @@ -129,14 +161,14 @@ loadContigs <- function(input,
chain2 <- df[[i]][,c("BC","TRBV","TRBD","TRBJ","TRB_CDR3nuc","TRB_CDR3","TRB_nReads","TRB_CDR3_UMIcount")]
chain2 <- data.frame(chain2[,1], chain = "TRB", chain2[,2:4], c_gene = NA, chain2[,5:8])
colnames(chain2) <- c("barcode", "chain", "v_gene", "d_gene", "j_gene", "c_gene", "cdr3_nt", "cdr3", "reads", "umis")

#TRA Chain 1
chain1 <- df[[i]][,c("BC","TRAV","TRAJ","TRA_CDR3nuc","TRA_CDR3","TRA_nReads","TRA_CDR3_UMIcount")]
chain1 <- data.frame(chain1[,1], chain = "TRA",chain1[,2], d_gene = NA, chain1[,3], c_gene = NA, chain1[,4:7])
colnames(chain1) <- c("barcode", "chain", "v_gene", "d_gene", "j_gene", "c_gene", "cdr3_nt", "cdr3", "reads", "umis")
data2 <- rbind(chain1, chain2)
data2[data2 == ""] <- NA

#TRA Chain 2
chain3 <- df[[i]][,c("BC","TRAV.2","TRAJ.2","TRA.2_CDR3nuc","TRA.2_CDR3","TRA.2_nReads","TRA.2_CDR3_UMIcount")]
chain3 <- data.frame(chain3[,1], chain = "TRA",chain3[,2], d_gene = NA, chain3[,3], c_gene = NA, chain3[,4:7])
Expand All @@ -145,7 +177,7 @@ loadContigs <- function(input,
data2[data2 == ""] <- NA
df[[i]] <- data2
df[[i]] <- df[[i]][with(df[[i]], order(reads, chain)),]

}
return(df)
}
Expand All @@ -167,7 +199,7 @@ loadContigs <- function(input,
df[[i]] <- subset(df[[i]], productive %in% c(TRUE, "TRUE", "True", "true"))
if (nrow(df[[i]]) == 0) { stop(
"There are 0 contigs after internal filtering -
check the contig list to see if any issues exist
check the contig list to see if any issues exist
for productive chains", call. = FALSE) }
df[[i]] <- subset(df[[i]], cdr3 != "None")
df[[i]][df[[i]] == ""] <- NA
Expand All @@ -184,14 +216,6 @@ loadContigs <- function(input,
}
return(df)
}
# Adds a `chain` column to each contig table in the list, taken from the
# first three characters of its v_gene column, and returns the updated list.
.chain.parser <- function(df) {
  for (idx in seq_along(df)) {
    geneNames <- df[[idx]][, "v_gene"]
    df[[idx]]$chain <- substr(geneNames, 1, 3)
  }
  df
}


.parseOmniscope <- function(df) {
for (i in seq_along(df)) {
Expand Down Expand Up @@ -259,20 +283,20 @@ loadContigs <- function(input,
colnames(TRA.2) <- 1:8
TRA <- rbind(TRA.1, TRA.2)
TRA$chain <- "TRA"

TRB.1 <- df[[i]][,c("Barcode", "TRB_V", "TRB_D", "TRB_J", "TRB_C", "TRB_cdr3_aa", "TRB_read_count", "TRB_transcript_count")]
TRB.2 <- df[[i]][,c("Barcode", "secondary_TRB_V", "secondary_TRB_D", "secondary_TRB_J", "secondary_TRB_C", "secondary_TRB_cdr3_aa", "secondary_TRB_read_count", "secondary_TRB_transcript_count")]
colnames(TRB.1) <- 1:8
colnames(TRB.2) <- 1:8
TRB <- rbind(TRB.1, TRB.2)
TRB$chain <- "TRB"

data2 <- rbind(TRA, TRB)
data2 <- data2[rowSums(is.na(data2[2:8])) != 7, ]
colnames(data2) <- c("barcode", "v_gene", "d_gene", "j_gene", "c_gene", "cdr3", "reads", "umis", "chain")
data2$cdr3_nt <- NA
data2 <- data2[,c("barcode", "chain", "v_gene", "d_gene", "j_gene", "c_gene", "cdr3_nt", "cdr3", "reads", "umis")]

df[[i]] <- data2
df[[i]] <- df[[i]][with(df[[i]], order(reads, chain)),]
}
Expand All @@ -283,6 +307,6 @@ loadContigs <- function(input,
for (i in seq_along(df)) {
df[[i]] <- df[[i]][,c("cell_id", "locus", "consensus_count", "v_call", "d_call", "j_call", "c_call", "cdr3", "cdr3_aa", "productive")]
colnames(df[[i]]) <- c("barcode", "chain", "reads", "v_gene", "d_gene", "j_gene", "c_gene", "cdr3_nt", "cdr3", "productive")
}
}
return(df)
}
Loading

0 comments on commit 4845b7c

Please sign in to comment.