drop all NA rows from loadContigs output

ncborcherding · Nov 8, 2024 · 19cabb6 · 19cabb6
1 parent bda1cb5
commit 19cabb6
Show file tree

Hide file tree

Showing 4 changed files with 106 additions and 51 deletions.
diff --git a/R/loadContigs.R b/R/loadContigs.R
@@ -1,24 +1,24 @@
 #' Loading the contigs derived from single-cell sequencing
 #'
+#' @description
 #' This function generates a contig list and formats the data to allow for
 #' use with [combineTCR()] or [combineBCR()]. If
 #' using data derived from filtered outputs of 10X Genomics, there is no
 #' need to use this function as the data is already compatible.
 #'
 #' The files that this function parses include:
-#' \itemize{
-#'   \item 10X =  "filtered_contig_annotations.csv"
-#'   \item AIRR = "airr_rearrangement.tsv"
-#'   \item BD = "Contigs_AIRR.tsv"
-#'   \item Dandelion = "all_contig_dandelion.tsv"
-#'   \item Immcantation = "data.tsv"
-#'   \item JSON = ".json"
-#'   \item ParseBio = "barcode_report.tsv"
-#'   \item MiXCR = "clones.tsv"
-#'   \item Omniscope = ".csv"
-#'   \item TRUST4 = "barcode_report.tsv"
-#'   \item WAT3R = "barcode_results.csv"
-#' }
+#'
+#' - **10X**: `"filtered_contig_annotations.csv"`
+#' - **AIRR**: `"airr_rearrangement.tsv"`
+#' - **BD**: `"Contigs_AIRR.tsv"`
+#' - **Dandelion**: `"all_contig_dandelion.tsv"`
+#' - **Immcantation**: `"data.tsv"`
+#' - **JSON**: `".json"`
+#' - **ParseBio**: `"barcode_report.tsv"`
+#' - **MiXCR**: `"clones.tsv"`
+#' - **Omniscope**: `".csv"`
+#' - **TRUST4**: `"barcode_report.tsv"`
+#' - **WAT3R**: `"barcode_results.csv"`
 #'
 #' @examples
 #' TRUST4 <- read.csv("https://www.borch.dev/uploads/contigs/TRUST4_contigs.csv")
@@ -40,15 +40,19 @@
 #' @export
 #' @concept Loading_and_Processing_Contigs
 #' @return List of contigs for compatibility  with [combineTCR()] or
-#' [combineBCR()]
+#' [combineBCR()]. Note that rows in which every column other than
+#' `barcode` is NA are dropped from the final output.
+#'
 loadContigs <- function(input, format = "10X") {
 
-    assert_that(is.string(input) || is.list(input) || is.data.frame(input))
-    assert_that(is.string(format))
-    assert_that(format %in% c(
-        "10X", "AIRR", "BD", "Dandelion", "JSON", "MiXCR", "ParseBio",
-        "Omniscope", "TRUST4", "WAT3R", "Immcantation"
-    ))
+    assert_that(
+      is.string(input) || is.list(input) || is.data.frame(input),
+      is.string(format),
+      isIn(format, c(
+          "10X", "AIRR", "BD", "Dandelion", "JSON", "MiXCR", "ParseBio",
+          "Omniscope", "TRUST4", "WAT3R", "Immcantation"
+      ))
+    )
 
     #Loading from directory, recursively
     rawDataDfList <- if (inherits(x = input, what = "character")) {
@@ -107,7 +111,15 @@ loadContigs <- function(input, format = "10X") {
         "ParseBio" = .parseParse
     )
 
-    loadFunc(rawDataDfList)
+    rmAllNaRowsFromLoadContigs(loadFunc(rawDataDfList))
+}
+
+# Remove rows whose non-barcode columns are all NA from every data frame in
+# a list of parsed contigs.
+#
+# @param dfList List of data frames, as returned by the format-specific
+#   parser selected in loadContigs().
+# @return A list of the same length; each element is the corresponding data
+#   frame with the all-NA rows removed.
+rmAllNaRowsFromLoadContigs <- function(dfList) {
+    lapply(dfList, function(x) {
+        # Resolve the columns per data frame so that an empty list, or frames
+        # whose columns differ from the first one, do not raise an error.
+        cols <- colnames(x)
+        cols <- cols[cols != "barcode"]
+        # drop = FALSE keeps the result a data frame even for a single column.
+        x[rowSums(!is.na(x[cols])) > 0, , drop = FALSE]
+    })
 }
 
 #Formats TRUST4 data

diff --git a/R/typecheck.R b/R/typecheck.R
@@ -38,6 +38,10 @@ assertthat::on_failure(is_named_numeric) <- function(call, env) {
 
 # functions
 
-assertthat::on_failure(`%in%`) <- function(call, env) {
+# Named stand-in for `%in%`, so that a custom assertthat failure message can
+# be attached to it. Returns a logical vector with one element per element
+# of `x`, TRUE where that element occurs in `table`.
+isIn <- function(x, table) {
+    match(x, table, nomatch = 0L) > 0L
+}
+
+# Failure message for isIn(): reports the deparsed `x` and `table` arguments
+# of the failing call as a single string.
+assertthat::on_failure(isIn) <- function(call, env) {
-    paste0(deparse(call$x), " is not in ", deparse(call$table))
+    # deparse() splits long expressions into several strings, which would
+    # turn the message into a character vector; collapse each side to one.
+    oneLine <- function(expr) {
+        paste(deparse(expr, width.cutoff = 500L), collapse = " ")
+    }
+    paste0(oneLine(call$x), " is not in ", oneLine(call$table))
 }
diff --git a/man/loadContigs.Rd b/man/loadContigs.Rd
diff --git a/tests/testthat/test-loadContigs.R b/tests/testthat/test-loadContigs.R
@@ -1,62 +1,101 @@
 # test script for loadContigs.R - testcases are NOT comprehensive!
 
 test_that("loadContigs works", {
-    TRUST4 <- read.csv("https://www.borch.dev/uploads/contigs/TRUST4_contigs.csv")
-    trial1 <- loadContigs(TRUST4, format = "TRUST4")
-    expect_identical(trial1, 
-                     getdata("load", "loadContigs_TRUST4")
-    )
-
 
+    # Most cases below download an example contig table (network access is
+    # required); the 10X case uses `contig_list` loaded via data(). Each
+    # table is parsed with loadContigs() and compared with the
+    # getdata("load", ...) reference after that reference has been passed
+    # through rmAllNaRowsFromLoadContigs().
+    # NOTE(review): getdata() is defined outside this file; presumably it
+    # loads a stored fixture -- confirm.
     BD <- read.csv("https://www.borch.dev/uploads/contigs/BD_contigs.csv")
     trial2 <- loadContigs(BD, format = "BD")
     expect_identical(trial2, 
-                     getdata("load", "loadContigs_BD")
+                     rmAllNaRowsFromLoadContigs(getdata("load", "loadContigs_BD"))
     )
 
     WAT3R <- read.csv("https://www.borch.dev/uploads/contigs/WAT3R_contigs.csv")
     trial3 <- loadContigs(WAT3R, format = "WAT3R")
     expect_identical(trial3, 
-                     getdata("load", "loadContigs_WAT3R")
+                     rmAllNaRowsFromLoadContigs(getdata("load", "loadContigs_WAT3R"))
     )
 
     data("contig_list")
     trial4 <- loadContigs(contig_list[[1]], format = "10X")
     expect_identical(trial4, 
-                     getdata("load", "loadContigs_10x")
+                     rmAllNaRowsFromLoadContigs(getdata("load", "loadContigs_10x"))
     )
 
 
     MIXCR <- read.csv("https://www.borch.dev/uploads/contigs/MIXCR_contigs.csv")
     trial5 <- loadContigs(MIXCR, format = "MiXCR")
     expect_identical(trial5, 
-                     getdata("load", "loadContigs_MiXCR")
+                     rmAllNaRowsFromLoadContigs(getdata("load", "loadContigs_MiXCR"))
     )
 
     Immcantation <- read.csv("https://www.borch.dev/uploads/contigs/Immcantation_contigs.csv")
     trial6 <- loadContigs(Immcantation, format = "Immcantation")
     expect_identical(trial6, 
-                     getdata("load", "loadContigs_Immcantation")
+                     rmAllNaRowsFromLoadContigs(getdata("load", "loadContigs_Immcantation"))
     )
 
     OS <- read.csv("https://www.borch.dev/uploads/contigs/OS_contigs2.csv")
     trial7 <- loadContigs(OS, format = "Omniscope")
     expect_identical(trial7, 
-                     getdata("load", "loadContigs_Omniscope")
+                     rmAllNaRowsFromLoadContigs(getdata("load", "loadContigs_Omniscope"))
     )
 
     Parse <- read.csv("https://www.borch.dev/uploads/contigs/Parse_contigs.csv")
     trial8 <- loadContigs(Parse, format = "ParseBio")
     expect_identical(trial8, 
-                     getdata("load", "loadContigs_Parse")
+                     rmAllNaRowsFromLoadContigs(getdata("load", "loadContigs_Parse"))
     )
 
     Dandelion <- read.csv("https://www.borch.dev/uploads/contigs/Dandelion_contigs.csv")
     trial9 <- loadContigs(Dandelion, format = "Dandelion")
     expect_identical(trial9, 
-                     getdata("load", "loadContigs_Dandelion")
+                     rmAllNaRowsFromLoadContigs(getdata("load", "loadContigs_Dandelion"))
+    )
+})
+
+test_that("loadContigs(format='TRUST4') works", {
+
+    # Remote TRUST4 example (needs network access) compared with the
+    # getdata() reference after all-NA-row filtering of that reference.
+    TRUST4 <- read.csv("https://www.borch.dev/uploads/contigs/TRUST4_contigs.csv")
+    expect_identical(
+        loadContigs(TRUST4, format = "TRUST4"), 
+        rmAllNaRowsFromLoadContigs(getdata("load", "loadContigs_TRUST4"))
+    )
+
+    # Hand-built single-row input: chain1 and both secondary chains are "*";
+    # only chain2 carries a comma-separated contig record.
+    oneRowTrust4Input <- structure(
+        list(
+            `#barcode` = "CGTAGCGGTGATAAGT-1",
+            cell_type = "B",
+            chain1 = "*",
+            chain2 = "IGKV1D-43,*,IGKJ1,IGKC,TGTCAACAGTATAGTAGTGTCCCCTGGACGTTC,CQQYSSVPWTF,6.00,CGTAGCGGTGATAAGT-1_2,76.00,0",
+            secondary_chain1 = "*",
+            secondary_chain2 = "*"
+        ),
+        row.names = c(NA, -1L),
+        class = "data.frame"
+    )
+
+    # Expected result is a one-element list holding a single IGK row; the "*"
+    # in the second field of chain2 is expected to appear as d_gene "None".
+    # NOTE(review): presumably chain1 = "*" parses to an all-NA row that
+    # loadContigs() now drops -- confirm against the TRUST4 parser.
+    expectedParsedTrust4Data <- list(
+        structure(
+            list(
+                barcode = "CGTAGCGGTGATAAGT-1",
+                v_gene = "IGKV1D-43",
+                d_gene = "None",
+                j_gene = "IGKJ1",
+                c_gene = "IGKC",
+                cdr3_nt = "TGTCAACAGTATAGTAGTGTCCCCTGGACGTTC",
+                cdr3 = "CQQYSSVPWTF",
+                reads = "6.00",
+                chain = "IGK"
+            ),
+            row.names = 1L,
+            class = "data.frame"
+        )
+    )
+
+    expect_identical(
+        loadContigs(oneRowTrust4Input, format = "TRUST4"),
+        expectedParsedTrust4Data
     )
-}) 
+})
 
 # TODO Add tests for .json and AIRR
 # TODO Would be nice to have a dir option