Merge pull request #430 from Qile0317/429

Fix TRUST4 loadContigs issue in #429
ncborcherding · Oct 28, 2024 · 4845b7c · 4845b7c
2 parents 18a2d41 + c18c1d0
commit 4845b7c
Show file tree

Hide file tree

Showing 11 changed files with 214 additions and 185 deletions.
diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml
@@ -4,7 +4,7 @@ on:
   push:
     branches: [main, master, v2]
   pull_request:
-    branches: [main, master, v2]
+    branches: [main, master, v2, dev]
 
 name: R-CMD-check
 

diff --git a/.gitignore b/.gitignore
@@ -7,4 +7,6 @@ local_tests.R
 docs
 vignettes/articles/scRep_example_full.rds
 .vscode
-qile
+qile
+dev
+.lintr
diff --git a/NEWS.md b/NEWS.md
@@ -5,6 +5,7 @@
 * Removed unnecessary code remnant in ```clonalLength()```
 * Allow one sample to be plotted by ```percentVJ()```
 * Fixed issue with ```positionalProperty()``` and exportTable
+* Fixed issue with ```loadContigs()``` edge case when TRUST4 data has only one row.
 
 # scRepertoire VERSION 2.0.7
 

diff --git a/R/loadContigs.R b/R/loadContigs.R
@@ -1,124 +1,156 @@
 #' Loading the contigs derived from single-cell sequencing
 #'
-#' This function generates a contig list and formats the data to allow for 
-#' function with  \code{\link{combineTCR}} or \code{\link{combineBCR}}. If 
-#' using data derived from filtered outputs of 10X Genomics, there is no 
-#' need to use this function as the data is already compatible. 
-#' 
-#' The files that this function parses includes:  
+#' This function generates a contig list and formats the data for use with
+#' \code{\link{combineTCR}} or \code{\link{combineBCR}}. If
+#' using data derived from filtered outputs of 10X Genomics, there is no
+#' need to use this function as the data is already compatible.
+#'
+#' The files that this function parses include:
 #' \itemize{
 #'   \item 10X =  "filtered_contig_annotations.csv"
-#'   \item AIRR = "airr_rearrangement.tsv" 
-#'   \item BD = "Contigs_AIRR.tsv" 
-#'   \item Dandelion = "all_contig_dandelion.tsv" 
-#'   \item Immcantation = "data.tsv" 
+#'   \item AIRR = "airr_rearrangement.tsv"
+#'   \item BD = "Contigs_AIRR.tsv"
+#'   \item Dandelion = "all_contig_dandelion.tsv"
+#'   \item Immcantation = "_data.tsv"
 #'   \item JSON = ".json"
 #'   \item ParseBio = "barcode_report.tsv"
 #'   \item MiXCR = "clones.tsv"
-#'   \item Omniscope = ".csv" 
+#'   \item Omniscope = "_OSB.csv" or "_OST.csv"
 #'   \item TRUST4 = "barcode_report.tsv"
-#'   \item WAT3R = "barcode_results.csv" 
+#'   \item WAT3R = "barcode_results.csv"
 #' }
-#' 
+#'
 #' @examples
 #' TRUST4 <- read.csv("https://www.borch.dev/uploads/contigs/TRUST4_contigs.csv")
 #' contig.list <- loadContigs(TRUST4, format = "TRUST4")
-#' 
+#'
 #' BD <- read.csv("https://www.borch.dev/uploads/contigs/BD_contigs.csv")
 #' contig.list <- loadContigs(BD, format = "BD")
-#' 
+#'
 #' WAT3R <- read.csv("https://www.borch.dev/uploads/contigs/WAT3R_contigs.csv")
 #' contig.list <- loadContigs(WAT3R, format = "WAT3R")
-#' 
-#' @param input The directory in which contigs are located or a list with contig elements
-#' @param format The format of the single-cell contig, currently supporting: 
-#' "10X", "AIRR", "BD", "Dandelion", "JSON", "MiXCR", "ParseBio", "Omniscope", "TRUST4", and "WAT3R"
+#'
+#' @param input The directory in which contigs are located or a list with contig
+#' elements
+#' @param format The format of the single-cell contig, currently supporting:
+#' "10X", "AIRR", "BD", "Dandelion", "JSON", "MiXCR", "ParseBio", "Omniscope",
+#' "TRUST4", "WAT3R", and "Immcantation"
 #' @importFrom utils read.csv read.delim
 #' @importFrom rjson fromJSON
 #' @export
 #' @concept Loading_and_Processing_Contigs
-#' @return List of contigs for compatibility  with \code{\link{combineTCR}} or 
+#' @return List of contigs for compatibility  with \code{\link{combineTCR}} or
 #' \code{\link{combineBCR}}
-loadContigs <- function(input, 
-                        format = "10X") {
-  #Loading from directory, recursively
-  if (inherits(x=input, what ="character")) {
-    format.list <- list("WAT3R" = "barcode_results.csv", 
-                        "10X" =  "filtered_contig_annotations.csv", 
-                        "AIRR" = "airr_rearrangement.tsv", 
-                        "Dandelion" = "all_contig_dandelion.tsv",
-                        "Immcantation" = "_data.tsv",
-                        "MiXCR" = "clones.tsv", 
-                        "JSON" = ".json",
-                        "TRUST4" = "barcode_report.tsv", 
-                        "BD" = "Contigs_AIRR.tsv",
-                        "Omniscope" =c("_OSB.csv", "_OST.csv"),
-                        "ParseBio" = "barcode_report.tsv")
+loadContigs <- function(input, format = "10X") {
+
+    # Validate up front so the dispatch below never sees an unknown format.
+    assert_that(is.string(input) || is.list(input) || is.data.frame(input))
+    assert_that(is.string(format))
+    assert_that(format %in% c(
+        "10X", "AIRR", "BD", "Dandelion", "JSON", "MiXCR", "ParseBio",
+        "Omniscope", "TRUST4", "WAT3R", "Immcantation"
+    ))
+
+    #Loading from directory, recursively
+    rawDataDfList <- if (inherits(x = input, what = "character")) {
+
+        # File-name suffix (or suffixes) searched for under each format.
+        format.list <- list(
+            "WAT3R" = "barcode_results.csv",
+            "10X" =  "filtered_contig_annotations.csv",
+            "AIRR" = "airr_rearrangement.tsv",
+            "Dandelion" = "all_contig_dandelion.tsv",
+            "Immcantation" = "_data.tsv",
+            "MiXCR" = "clones.tsv",
+            "JSON" = ".json",
+            "TRUST4" = "barcode_report.tsv",
+            "BD" = "Contigs_AIRR.tsv",
+            "Omniscope" = c("_OSB.csv", "_OST.csv"),
+            "ParseBio" = "barcode_report.tsv"
+        )
         file.pattern <- format.list[[format]]
-        contig.files <- list.files(input, paste0(file.pattern, collapse = "|"), recursive = TRUE, full.names = TRUE)
-
-        if (format %in% c("10X", "WAT3R", "Omniscope")) {
-          df <- lapply(contig.files, read.csv) 
-        } else if(format %in% c("json")) { 
-          df <- lapply(contig.files, function(x) {
-            tmp <- as.data.frame(fromJSON(x))
-          })
+        # list.files() expects a regular expression, not a glob; glob2rx()
+        # escapes the literal dots and anchors each suffix at the end of the
+        # file name. Multiple suffixes are joined as alternatives.
+        contig.files <- list.files(
+            input,
+            paste0(utils::glob2rx(paste0("*", file.pattern)), collapse = "|"),
+            recursive = TRUE,
+            full.names = TRUE
+        )
+
+        if (length(contig.files) == 0) {
+            warning("No files found in the directory")
+            return(list())
+        }
+
+        # The validated format name is upper-case "JSON"; comparing against
+        # "json" never matches and would send JSON files to read.delim.
+        reader <- if (format == "JSON") {
+            function(x) as.data.frame(fromJSON(x))
+        } else if (format %in% c("10X", "WAT3R", "Omniscope")) {
+            read.csv
         } else {
-          df <- lapply(contig.files, read.delim)
+            read.delim
         }
-  #Already loaded list or data frame
-  } else if (inherits(x=input, what ="list") | inherits(x=input, what ="data.frame")) {
-    df <- .checkList(input)
-  }
-
-  loadFunc <- switch(format,
-                     "10X" = .parse10x,
-                     "AIRR" = .parseAIRR,
-                     "Dandelion" = .parseDandelion,
-                     "JSON" = .parseJSON,
-                     "MiXCR" = .parseMiXCR,
-                     "TRUST4" = .parseTRUST4,
-                     "BD" = .parseBD,
-                     "WAT3R"  = .parseWAT3R,
-                     "Omniscope" = .parseOmniscope,
-                     "Immcantation" = .parseImmcantation,
-                     "ParseBio" = .parseParse,
-                      stop("Invalid format provided"))
-
-  df <- loadFunc(df)
-  return(df)
+
+        lapply(contig.files, reader)
+
+    } else { # handle an already loaded list of dfs / 1 df
+        .checkList(input)
+    }
+
+    # No default branch is needed: `format` was validated above.
+    loadFunc <- switch(format,
+        "10X" = .parse10x,
+        "AIRR" = .parseAIRR,
+        "Dandelion" = .parseDandelion,
+        "JSON" = .parseJSON,
+        "MiXCR" = .parseMiXCR,
+        "TRUST4" = .parseTRUST4,
+        "BD" = .parseBD,
+        "WAT3R"  = .parseWAT3R,
+        "Omniscope" = .parseOmniscope,
+        "Immcantation" = .parseImmcantation,
+        "ParseBio" = .parseParse
+    )
+
+    loadFunc(rawDataDfList)
 }
 
 #Formats TRUST4 data
 #' @importFrom stringr str_split
 .parseTRUST4 <- function(df) {
-    for (i in seq_along(df)) {
-        colnames(df[[i]])[1] <- "barcode"
-        df[[i]][df[[i]] == "*"] <- NA
-
-        if(length(which(is.na(df[[i]]$chain1))) == length(df[[i]]$chain1)) {
-          chain2 <- matrix(ncol = 7, nrow = length(df[[i]]$chain1))
-        } else {
-          chain2 <- str_split(df[[i]]$chain1, ",", simplify = TRUE)[,seq_len(7)]
-          chain2[chain2 == "*"] <- "None"
-        }
-        colnames(chain2) <- c("v_gene", "d_gene", "j_gene", "c_gene", "cdr3_nt", "cdr3", "reads")
-        chain2 <- data.frame(barcode = df[[i]][,1], chain2)
-
-        if(length(which(is.na(df[[i]]$chain2))) == length(df[[i]]$chain2)) {
-          chain1 <- matrix(ncol = 7, nrow = length(df[[i]]$chain2))
+
+    # Builds one chain table from the column named `chain_col`: the barcode
+    # plus seven fields (v_gene, d_gene, j_gene, c_gene, cdr3_nt, cdr3, reads).
+    processChain <- function(data, chain_col) {
+        if (all(is.na(data[[chain_col]]))) {
+            # Column is entirely NA: all-NA placeholder, one row per input row.
+            chain <- matrix(ncol = 7, nrow = length(data[[chain_col]]))
         } else {
-          chain1 <- str_split(df[[i]]$chain2, ",", simplify = TRUE)[,seq_len(7)]
-          chain1[chain1 == "*"] <- "None"
+            # Split the comma-separated field and keep the first seven pieces.
+            chain <- str_split(data[[chain_col]], ",", simplify = TRUE)
+            # drop = FALSE keeps a matrix even when the input has a single row.
+            chain <- chain[, seq_len(7), drop = FALSE]
+            chain[chain == "*"] <- "None"
         }
-        colnames(chain1) <- c("v_gene", "d_gene", "j_gene", "c_gene", "cdr3_nt", "cdr3", "reads")
-        chain1 <- data.frame(barcode = df[[i]][,1], chain1)
-        data2 <- rbind(chain1, chain2)
-        data2[data2 == ""] <- NA
-        df[[i]] <- data2
+        colnames(chain) <- c(
+            "v_gene", "d_gene", "j_gene", "c_gene", "cdr3_nt", "cdr3", "reads"
+        )
+        data.frame(barcode = data$barcode, chain)
     }
-    df <- .chain.parser(df)
-    return(df)
+
+    # Reformat every data frame in the input list independently.
+    formattedDfs <- lapply(df, function(data) {
+
+        # The first column is treated as the barcode; "*" entries become NA.
+        colnames(data)[1] <- "barcode"
+        data[data == "*"] <- NA
+
+        # not a mistake, opposite definitions in TRUST4 and scRepertoire
+        chain1 <- processChain(data, "chain2")
+        chain2 <- processChain(data, "chain1")
+
+        # Stack both chain tables and turn empty strings into NA.
+        combined_data <- rbind(chain1, chain2)
+        combined_data[combined_data == ""] <- NA
+        combined_data
+    })
+    # is it necessary to drop rows that are fully NA with an existing barcode?
+    .chain.parser(formattedDfs)
+}
+
+#Grabs the chain info from v_gene
+# For each data frame in the list `df`, sets a `chain` column to the first
+# three characters of `v_gene`, then returns the updated list.
+.chain.parser <- function(df) {
+    lapply(df, function(x) {
+        x$chain <- substr(x$v_gene, 1, 3)
+        x
+    })
 }
 
 #Formats wat3r data
@@ -129,14 +161,14 @@ loadContigs <- function(input,
         chain2 <- df[[i]][,c("BC","TRBV","TRBD","TRBJ","TRB_CDR3nuc","TRB_CDR3","TRB_nReads","TRB_CDR3_UMIcount")]
         chain2 <- data.frame(chain2[,1], chain = "TRB", chain2[,2:4], c_gene = NA, chain2[,5:8])
         colnames(chain2) <- c("barcode", "chain", "v_gene", "d_gene", "j_gene", "c_gene", "cdr3_nt", "cdr3", "reads", "umis")
-        
+
         #TRA Chain 1
         chain1 <-  df[[i]][,c("BC","TRAV","TRAJ","TRA_CDR3nuc","TRA_CDR3","TRA_nReads","TRA_CDR3_UMIcount")]
         chain1 <- data.frame(chain1[,1], chain = "TRA",chain1[,2], d_gene = NA, chain1[,3], c_gene = NA, chain1[,4:7])
         colnames(chain1) <- c("barcode", "chain", "v_gene", "d_gene", "j_gene", "c_gene", "cdr3_nt", "cdr3", "reads", "umis")
         data2 <- rbind(chain1, chain2)
         data2[data2 == ""] <- NA
-        
+
         #TRA Chain 2
         chain3 <-  df[[i]][,c("BC","TRAV.2","TRAJ.2","TRA.2_CDR3nuc","TRA.2_CDR3","TRA.2_nReads","TRA.2_CDR3_UMIcount")]
         chain3 <- data.frame(chain3[,1], chain = "TRA",chain3[,2],  d_gene = NA, chain3[,3], c_gene = NA, chain3[,4:7])
@@ -145,7 +177,7 @@ loadContigs <- function(input,
         data2[data2 == ""] <- NA
         df[[i]] <- data2
         df[[i]] <- df[[i]][with(df[[i]], order(reads, chain)),]
-        
+
     }
     return(df)
 }
@@ -167,7 +199,7 @@ loadContigs <- function(input,
         df[[i]] <- subset(df[[i]], productive %in% c(TRUE, "TRUE", "True", "true"))
         if (nrow(df[[i]]) == 0) { stop(
             "There are 0 contigs after internal filtering -
-            check the contig list to see if any issues exist 
+            check the contig list to see if any issues exist
             for productive chains", call. = FALSE) }
         df[[i]] <- subset(df[[i]], cdr3 != "None")
         df[[i]][df[[i]] == ""] <- NA
@@ -184,14 +216,6 @@ loadContigs <- function(input,
   }
   return(df)
 }
-#Grabs the chain info from v_gene
-.chain.parser <- function(df) {
-  for (i in seq_along(df)) {
-    df[[i]]$chain <- substr(df[[i]][,"v_gene"],1,3)
-  }
-  return(df)
-}
-
 
 .parseOmniscope <- function(df) {
   for (i in seq_along(df)) {
@@ -259,20 +283,20 @@ loadContigs <- function(input,
     colnames(TRA.2) <- 1:8
     TRA <- rbind(TRA.1, TRA.2)
     TRA$chain <- "TRA"
-    
+
     TRB.1 <- df[[i]][,c("Barcode", "TRB_V", "TRB_D", "TRB_J", "TRB_C", "TRB_cdr3_aa", "TRB_read_count", "TRB_transcript_count")]
     TRB.2 <- df[[i]][,c("Barcode", "secondary_TRB_V", "secondary_TRB_D", "secondary_TRB_J", "secondary_TRB_C", "secondary_TRB_cdr3_aa", "secondary_TRB_read_count", "secondary_TRB_transcript_count")]
     colnames(TRB.1) <- 1:8
     colnames(TRB.2) <- 1:8
     TRB <- rbind(TRB.1, TRB.2)
     TRB$chain <- "TRB"
-    
+
     data2 <- rbind(TRA, TRB)
     data2 <- data2[rowSums(is.na(data2[2:8])) != 7, ]
     colnames(data2) <- c("barcode", "v_gene", "d_gene", "j_gene", "c_gene", "cdr3", "reads", "umis", "chain")
     data2$cdr3_nt <- NA
     data2 <- data2[,c("barcode", "chain", "v_gene", "d_gene", "j_gene", "c_gene", "cdr3_nt", "cdr3", "reads", "umis")]
-    
+
     df[[i]] <- data2
     df[[i]] <- df[[i]][with(df[[i]], order(reads, chain)),]
   }
@@ -283,6 +307,6 @@ loadContigs <- function(input,
   for (i in seq_along(df)) {
     df[[i]] <- df[[i]][,c("cell_id", "locus", "consensus_count", "v_call", "d_call", "j_call", "c_call", "cdr3", "cdr3_aa", "productive")]
     colnames(df[[i]]) <- c("barcode", "chain", "reads", "v_gene", "d_gene", "j_gene", "c_gene", "cdr3_nt", "cdr3", "productive")
-  } 
+  }
   return(df)
 }
-Original file line number
+Diff line change
@@ Expand Up / @@ -7,4 +7,6 @@ local_tests.R @@
     docs
     vignettes/articles/scRep_example_full.rds
     .vscode
-    qile
+    qile
+    dev
+    .lintr