From 57a2f6b3fde7ea591afcea248781318f5de30a4a Mon Sep 17 00:00:00 2001 From: Qile0317 Date: Sun, 27 Oct 2024 16:13:57 -0700 Subject: [PATCH 01/11] add to .gitignore --- .gitignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 7163266c..2fc51a59 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,6 @@ local_tests.R docs vignettes/articles/scRep_example_full.rds .vscode -qile \ No newline at end of file +qile +dev +.lintr \ No newline at end of file From 69325adeb2b2d0c39f89c5bc58e2fd5863d35819 Mon Sep 17 00:00:00 2001 From: Qile0317 Date: Sun, 27 Oct 2024 16:19:13 -0700 Subject: [PATCH 02/11] rm trailing whitespace in loadContigs --- R/loadContigs.R | 80 +++++++++++++++++------------------ man/clonalAbundance.Rd | 32 +++++++------- man/clonalSizeDistribution.Rd | 24 +++++------ man/percentAA.Rd | 10 ++--- man/positionalEntropy.Rd | 22 +++++----- man/positionalProperty.Rd | 16 +++---- man/vizGenes.Rd | 24 +++++------ 7 files changed, 104 insertions(+), 104 deletions(-) diff --git a/R/loadContigs.R b/R/loadContigs.R index 469c9f3f..06119d87 100644 --- a/R/loadContigs.R +++ b/R/loadContigs.R @@ -1,65 +1,65 @@ #' Loading the contigs derived from single-cell sequencing #' -#' This function generates a contig list and formats the data to allow for -#' function with \code{\link{combineTCR}} or \code{\link{combineBCR}}. If -#' using data derived from filtered outputs of 10X Genomics, there is no -#' need to use this function as the data is already compatible. -#' -#' The files that this function parses includes: +#' This function generates a contig list and formats the data to allow for +#' function with \code{\link{combineTCR}} or \code{\link{combineBCR}}. If +#' using data derived from filtered outputs of 10X Genomics, there is no +#' need to use this function as the data is already compatible. +#' +#' The files that this function parses includes: #' \itemize{ #' \item 10X = "filtered_contig_annotations.csv" -#' \item AIRR = "airr_rearrangement.tsv" -#' \item BD = "Contigs_AIRR.tsv" -#' \item Dandelion = "all_contig_dandelion.tsv" -#' \item Immcantation = "data.tsv" +#' \item AIRR = "airr_rearrangement.tsv" +#' \item BD = "Contigs_AIRR.tsv" +#' \item Dandelion = "all_contig_dandelion.tsv" +#' \item Immcantation = "data.tsv" #' \item JSON = ".json" #' \item ParseBio = "barcode_report.tsv" #' \item MiXCR = "clones.tsv" -#' \item Omniscope = ".csv" +#' \item Omniscope = ".csv" #' \item TRUST4 = "barcode_report.tsv" -#' \item WAT3R = "barcode_results.csv" +#' \item WAT3R = "barcode_results.csv" #' } -#' +#' #' @examples #' TRUST4 <- read.csv("https://www.borch.dev/uploads/contigs/TRUST4_contigs.csv") #' contig.list <- loadContigs(TRUST4, format = "TRUST4") -#' +#' #' BD <- read.csv("https://www.borch.dev/uploads/contigs/BD_contigs.csv") #' contig.list <- loadContigs(BD, format = "BD") -#' +#' #' WAT3R <- read.csv("https://www.borch.dev/uploads/contigs/WAT3R_contigs.csv") #' contig.list <- loadContigs(WAT3R, format = "WAT3R") -#' +#' #' @param input The directory in which contigs are located or a list with contig elements -#' @param format The format of the single-cell contig, currently supporting: +#' @param format The format of the single-cell contig, currently supporting: #' "10X", "AIRR", "BD", "Dandelion", "JSON", "MiXCR", "ParseBio", "Omniscope", "TRUST4", and "WAT3R" #' @importFrom utils read.csv read.delim #' @importFrom rjson fromJSON #' @export #' @concept Loading_and_Processing_Contigs -#' @return List of contigs for compatibility with \code{\link{combineTCR}} or +#' @return List of contigs for compatibility with \code{\link{combineTCR}} or #' \code{\link{combineBCR}} -loadContigs <- function(input, +loadContigs <- function(input, format = "10X") { #Loading from directory, recursively if (inherits(x=input, what ="character")) { - format.list <- list("WAT3R" = "barcode_results.csv", - "10X" = "filtered_contig_annotations.csv", - "AIRR" = "airr_rearrangement.tsv", + format.list <- list("WAT3R" = "barcode_results.csv", + "10X" = "filtered_contig_annotations.csv", + "AIRR" = "airr_rearrangement.tsv", "Dandelion" = "all_contig_dandelion.tsv", "Immcantation" = "_data.tsv", - "MiXCR" = "clones.tsv", + "MiXCR" = "clones.tsv", "JSON" = ".json", - "TRUST4" = "barcode_report.tsv", + "TRUST4" = "barcode_report.tsv", "BD" = "Contigs_AIRR.tsv", "Omniscope" =c("_OSB.csv", "_OST.csv"), "ParseBio" = "barcode_report.tsv") file.pattern <- format.list[[format]] contig.files <- list.files(input, paste0(file.pattern, collapse = "|"), recursive = TRUE, full.names = TRUE) - + if (format %in% c("10X", "WAT3R", "Omniscope")) { - df <- lapply(contig.files, read.csv) - } else if(format %in% c("json")) { + df <- lapply(contig.files, read.csv) + } else if(format %in% c("json")) { df <- lapply(contig.files, function(x) { tmp <- as.data.frame(fromJSON(x)) }) @@ -70,7 +70,7 @@ loadContigs <- function(input, } else if (inherits(x=input, what ="list") | inherits(x=input, what ="data.frame")) { df <- .checkList(input) } - + loadFunc <- switch(format, "10X" = .parse10x, "AIRR" = .parseAIRR, @@ -84,7 +84,7 @@ loadContigs <- function(input, "Immcantation" = .parseImmcantation, "ParseBio" = .parseParse, stop("Invalid format provided")) - + df <- loadFunc(df) return(df) } @@ -95,7 +95,7 @@ loadContigs <- function(input, for (i in seq_along(df)) { colnames(df[[i]])[1] <- "barcode" df[[i]][df[[i]] == "*"] <- NA - + if(length(which(is.na(df[[i]]$chain1))) == length(df[[i]]$chain1)) { chain2 <- matrix(ncol = 7, nrow = length(df[[i]]$chain1)) } else { @@ -104,7 +104,7 @@ loadContigs <- function(input, } colnames(chain2) <- c("v_gene", "d_gene", "j_gene", "c_gene", "cdr3_nt", "cdr3", "reads") chain2 <- data.frame(barcode = df[[i]][,1], chain2) - + if(length(which(is.na(df[[i]]$chain2))) == length(df[[i]]$chain2)) { chain1 <- matrix(ncol = 7, nrow = length(df[[i]]$chain2)) } else { @@ -129,14 +129,14 @@ loadContigs <- function(input, chain2 <- df[[i]][,c("BC","TRBV","TRBD","TRBJ","TRB_CDR3nuc","TRB_CDR3","TRB_nReads","TRB_CDR3_UMIcount")] chain2 <- data.frame(chain2[,1], chain = "TRB", chain2[,2:4], c_gene = NA, chain2[,5:8]) colnames(chain2) <- c("barcode", "chain", "v_gene", "d_gene", "j_gene", "c_gene", "cdr3_nt", "cdr3", "reads", "umis") - + #TRA Chain 1 chain1 <- df[[i]][,c("BC","TRAV","TRAJ","TRA_CDR3nuc","TRA_CDR3","TRA_nReads","TRA_CDR3_UMIcount")] chain1 <- data.frame(chain1[,1], chain = "TRA",chain1[,2], d_gene = NA, chain1[,3], c_gene = NA, chain1[,4:7]) colnames(chain1) <- c("barcode", "chain", "v_gene", "d_gene", "j_gene", "c_gene", "cdr3_nt", "cdr3", "reads", "umis") data2 <- rbind(chain1, chain2) data2[data2 == ""] <- NA - + #TRA Chain 2 chain3 <- df[[i]][,c("BC","TRAV.2","TRAJ.2","TRA.2_CDR3nuc","TRA.2_CDR3","TRA.2_nReads","TRA.2_CDR3_UMIcount")] chain3 <- data.frame(chain3[,1], chain = "TRA",chain3[,2], d_gene = NA, chain3[,3], c_gene = NA, chain3[,4:7]) @@ -145,7 +145,7 @@ loadContigs <- function(input, data2[data2 == ""] <- NA df[[i]] <- data2 df[[i]] <- df[[i]][with(df[[i]], order(reads, chain)),] - + } return(df) } @@ -167,7 +167,7 @@ loadContigs <- function(input, df[[i]] <- subset(df[[i]], productive %in% c(TRUE, "TRUE", "True", "true")) if (nrow(df[[i]]) == 0) { stop( "There are 0 contigs after internal filtering - - check the contig list to see if any issues exist + check the contig list to see if any issues exist for productive chains", call. = FALSE) } df[[i]] <- subset(df[[i]], cdr3 != "None") df[[i]][df[[i]] == ""] <- NA @@ -191,7 +191,7 @@ loadContigs <- function(input, } return(df) } - + .parseOmniscope <- function(df) { for (i in seq_along(df)) { @@ -259,20 +259,20 @@ loadContigs <- function(input, colnames(TRA.2) <- 1:8 TRA <- rbind(TRA.1, TRA.2) TRA$chain <- "TRA" - + TRB.1 <- df[[i]][,c("Barcode", "TRB_V", "TRB_D", "TRB_J", "TRB_C", "TRB_cdr3_aa", "TRB_read_count", "TRB_transcript_count")] TRB.2 <- df[[i]][,c("Barcode", "secondary_TRB_V", "secondary_TRB_D", "secondary_TRB_J", "secondary_TRB_C", "secondary_TRB_cdr3_aa", "secondary_TRB_read_count", "secondary_TRB_transcript_count")] colnames(TRB.1) <- 1:8 colnames(TRB.2) <- 1:8 TRB <- rbind(TRB.1, TRB.2) TRB$chain <- "TRB" - + data2 <- rbind(TRA, TRB) data2 <- data2[rowSums(is.na(data2[2:8])) != 7, ] colnames(data2) <- c("barcode", "v_gene", "d_gene", "j_gene", "c_gene", "cdr3", "reads", "umis", "chain") data2$cdr3_nt <- NA data2 <- data2[,c("barcode", "chain", "v_gene", "d_gene", "j_gene", "c_gene", "cdr3_nt", "cdr3", "reads", "umis")] - + df[[i]] <- data2 df[[i]] <- df[[i]][with(df[[i]], order(reads, chain)),] } @@ -283,6 +283,6 @@ loadContigs <- function(input, for (i in seq_along(df)) { df[[i]] <- df[[i]][,c("cell_id", "locus", "consensus_count", "v_call", "d_call", "j_call", "c_call", "cdr3", "cdr3_aa", "productive")] colnames(df[[i]]) <- c("barcode", "chain", "reads", "v_gene", "d_gene", "j_gene", "c_gene", "cdr3_nt", "cdr3", "productive") - } + } return(df) } \ No newline at end of file diff --git a/man/clonalAbundance.Rd b/man/clonalAbundance.Rd index 52352220..671b04ba 100644 --- a/man/clonalAbundance.Rd +++ b/man/clonalAbundance.Rd @@ -16,18 +16,18 @@ clonalAbundance( ) } \arguments{ -\item{input.data}{The product of \code{\link{combineTCR}}, +\item{input.data}{The product of \code{\link{combineTCR}}, \code{\link{combineBCR}}, or \code{\link{combineExpression}}.} -\item{cloneCall}{How to call the clone - VDJC gene (\strong{gene}), +\item{cloneCall}{How to call the clone - VDJC gene (\strong{gene}), CDR3 nucleotide (\strong{nt}), CDR3 amino acid (\strong{aa}), -VDJC gene + CDR3 nucleotide (\strong{strict}) or a custom variable +VDJC gene + CDR3 nucleotide (\strong{strict}) or a custom variable in the data.} -\item{chain}{indicate if both or a specific chain should be used - +\item{chain}{indicate if both or a specific chain should be used - e.g. "both", "TRA", "TRG", "IGH", "IGL"} -\item{scale}{Converts the graphs into density plots in order to show +\item{scale}{Converts the graphs into density plots in order to show relative distributions.} \item{group.by}{The variable to use for grouping} @@ -38,29 +38,29 @@ to plot groups in order} \item{exportTable}{Returns the data frame used for forming the graph to the visualization.} -\item{palette}{Colors to use in visualization - input any +\item{palette}{Colors to use in visualization - input any \link[grDevices]{hcl.pals}.} } \value{ -ggplot of the total or relative abundance of clones +ggplot of the total or relative abundance of clones across quanta } \description{ -Displays the number of clones at specific frequencies by sample +Displays the number of clones at specific frequencies by sample or group. Visualization can either be a line graph ( -\strong{scale} = FALSE) using calculated numbers or density -plot (\strong{scale} = TRUE). Multiple sequencing runs can -be group together using the group parameter. If a matrix -output for the data is preferred, set +\strong{scale} = FALSE) using calculated numbers or density +plot (\strong{scale} = TRUE). Multiple sequencing runs can +be group together using the group parameter. If a matrix +output for the data is preferred, set \strong{exportTable} = TRUE. } \examples{ #Making combined contig data -combined <- combineTCR(contig_list, - samples = c("P17B", "P17L", "P18B", "P18L", +combined <- combineTCR(contig_list, + samples = c("P17B", "P17L", "P18B", "P18L", "P19B","P19L", "P20B", "P20L")) -clonalAbundance(combined, - cloneCall = "gene", +clonalAbundance(combined, + cloneCall = "gene", scale = FALSE) } diff --git a/man/clonalSizeDistribution.Rd b/man/clonalSizeDistribution.Rd index 2d34bb37..5c400479 100644 --- a/man/clonalSizeDistribution.Rd +++ b/man/clonalSizeDistribution.Rd @@ -16,36 +16,36 @@ clonalSizeDistribution( ) } \arguments{ -\item{input.data}{The product of \code{\link{combineTCR}}, +\item{input.data}{The product of \code{\link{combineTCR}}, \code{\link{combineBCR}}, or \code{\link{combineExpression}}.} -\item{cloneCall}{How to call the clone - VDJC gene (\strong{gene}), +\item{cloneCall}{How to call the clone - VDJC gene (\strong{gene}), CDR3 nucleotide (\strong{nt}), CDR3 amino acid (\strong{aa}), -VDJC gene + CDR3 nucleotide (\strong{strict}) or a custom variable +VDJC gene + CDR3 nucleotide (\strong{strict}) or a custom variable in the data.} -\item{chain}{indicate if both or a specific chain should be used - +\item{chain}{indicate if both or a specific chain should be used - e.g. "both", "TRA", "TRG", "IGH", "IGL".} \item{method}{The clustering parameter for the dendrogram.} -\item{threshold}{Numerical vector containing the thresholds +\item{threshold}{Numerical vector containing the thresholds the grid search was performed over.} \item{group.by}{The variable to use for grouping.} \item{exportTable}{Returns the data frame used for forming the graph.} -\item{palette}{Colors to use in visualization - input any +\item{palette}{Colors to use in visualization - input any \link[grDevices]{hcl.pals}.} } \value{ ggplot dendrogram of the clone size distribution } \description{ -This function produces a hierarchical clustering of clones by sample -using discrete gamma-GPD spliced threshold model. If using this -model please read and cite powerTCR (more info available at +This function produces a hierarchical clustering of clones by sample +using discrete gamma-GPD spliced threshold model. If using this +model please read and cite powerTCR (more info available at \href{https://pubmed.ncbi.nlm.nih.gov/30485278/}{PMID: 30485278}). } \details{ @@ -59,7 +59,7 @@ Where: \item{\eqn{\xi} is a shape parameter} \item{\eqn{x \ge \mu} if \eqn{\xi \ge 0} and \eqn{\mu \le x \le \mu - \sigma/\xi} if \eqn{\xi < 0}} } - + The probability density function (pdf) for the \strong{Gamma Distribution} is given by: \deqn{f(x|\alpha, \beta) = \frac{x^{\alpha-1} e^{-x/\beta}}{\beta^\alpha \Gamma(\alpha)}} @@ -73,8 +73,8 @@ Where: } \examples{ #Making combined contig data -combined <- combineTCR(contig_list, - samples = c("P17B", "P17L", "P18B", "P18L", +combined <- combineTCR(contig_list, + samples = c("P17B", "P17L", "P18B", "P18L", "P19B","P19L", "P20B", "P20L")) clonalSizeDistribution(combined, cloneCall = "strict", method="ward.D2") diff --git a/man/percentAA.Rd b/man/percentAA.Rd index eac3893d..27703a41 100644 --- a/man/percentAA.Rd +++ b/man/percentAA.Rd @@ -35,16 +35,16 @@ to plot groups in order} ggplot of stacked bar graphs of amino acid proportions } \description{ -This function the proportion of amino acids along the residues +This function the proportion of amino acids along the residues of the CDR3 amino acid sequence. } \examples{ #Making combined contig data -combined <- combineTCR(contig_list, - samples = c("P17B", "P17L", "P18B", "P18L", +combined <- combineTCR(contig_list, + samples = c("P17B", "P17L", "P18B", "P18L", "P19B","P19L", "P20B", "P20L")) -percentAA(combined, - chain = "TRB", +percentAA(combined, + chain = "TRB", aa.length = 20) } \concept{Summarize_Repertoire} diff --git a/man/positionalEntropy.Rd b/man/positionalEntropy.Rd index 0b79ca92..ed74563e 100644 --- a/man/positionalEntropy.Rd +++ b/man/positionalEntropy.Rd @@ -16,7 +16,7 @@ positionalEntropy( ) } \arguments{ -\item{input.data}{The product of \code{\link{combineTCR}}, +\item{input.data}{The product of \code{\link{combineTCR}}, \code{\link{combineBCR}}, or \code{\link{combineExpression}}} \item{chain}{"TRA", "TRB", "TRG", "TRG", "IGH", "IGL"} @@ -28,7 +28,7 @@ to plot groups in order} \item{aa.length}{The maximum length of the CDR3 amino acid sequence.} -\item{method}{The method to calculate the entropy/diversity - +\item{method}{The method to calculate the entropy/diversity - "shannon", "inv.simpson", "norm.entropy"} \item{exportTable}{Returns the data frame used for forming the graph} @@ -39,20 +39,20 @@ to plot groups in order} ggplot of line graph of diversity by position } \description{ -This function the diversity amino acids along the residues -of the CDR3 amino acid sequence. Please see -\code{\link{clonalDiversity}} for more information on -the underlying methods for diversity/entropy calculations. -Positions without variance will have a value reported as 0 +This function the diversity amino acids along the residues +of the CDR3 amino acid sequence. Please see +\code{\link{clonalDiversity}} for more information on +the underlying methods for diversity/entropy calculations. +Positions without variance will have a value reported as 0 for the purposes of comparison. } \examples{ #Making combined contig data -combined <- combineTCR(contig_list, - samples = c("P17B", "P17L", "P18B", "P18L", +combined <- combineTCR(contig_list, + samples = c("P17B", "P17L", "P18B", "P18L", "P19B","P19L", "P20B", "P20L")) -positionalEntropy(combined, - chain = "TRB", +positionalEntropy(combined, + chain = "TRB", aa.length = 20) } \concept{Summarize_Repertoire} diff --git a/man/positionalProperty.Rd b/man/positionalProperty.Rd index 787c33c3..d769b33c 100644 --- a/man/positionalProperty.Rd +++ b/man/positionalProperty.Rd @@ -16,7 +16,7 @@ positionalProperty( ) } \arguments{ -\item{input.data}{The product of \code{\link{combineTCR}}, +\item{input.data}{The product of \code{\link{combineTCR}}, \code{\link{combineBCR}}, or \code{\link{combineExpression}}} \item{chain}{"TRA", "TRB", "TRG", "TRG", "IGH", "IGL"} @@ -39,9 +39,9 @@ to plot groups in order} ggplot of line graph of diversity by position } \description{ -This function calculates the mean selected property for -amino acids along the residues of the CDR3 amino acid sequence. -The ribbon surrounding the individual line represents the 95% +This function calculates the mean selected property for +amino acids along the residues of the CDR3 amino acid sequence. +The ribbon surrounding the individual line represents the 95% confidence interval. } \details{ @@ -59,12 +59,12 @@ More information for the individual methods can be found at the following citati } \examples{ #Making combined contig data -combined <- combineTCR(contig_list, - samples = c("P17B", "P17L", "P18B", "P18L", +combined <- combineTCR(contig_list, + samples = c("P17B", "P17L", "P18B", "P18L", "P19B","P19L", "P20B", "P20L")) -positionalProperty(combined, +positionalProperty(combined, chain = "TRB", - method = "Atchley", + method = "Atchley", aa.length = 20) } \author{ diff --git a/man/vizGenes.Rd b/man/vizGenes.Rd index 12cfeabc..5c9b8346 100644 --- a/man/vizGenes.Rd +++ b/man/vizGenes.Rd @@ -17,47 +17,47 @@ vizGenes( ) } \arguments{ -\item{input.data}{The product of \code{\link{combineTCR}}, +\item{input.data}{The product of \code{\link{combineTCR}}, \code{\link{combineBCR}}, or \code{\link{combineExpression}}.} -\item{x.axis}{Gene segments to separate the x-axis, such as "TRAV", +\item{x.axis}{Gene segments to separate the x-axis, such as "TRAV", "TRBD", "IGKJ".} -\item{y.axis}{Variable to separate the y-axis, can be both categorical +\item{y.axis}{Variable to separate the y-axis, can be both categorical or other gene gene segments, such as "TRAV", "TRBD", "IGKJ".} \item{group.by}{Variable in which to group the diversity calculation.} \item{plot}{The type of plot to return - heatmap or barplot.} -\item{order}{Categorical variable to organize the x-axis, either +\item{order}{Categorical variable to organize the x-axis, either "gene" or "variance"} -\item{scale}{Converts the individual count of genes to proportion using +\item{scale}{Converts the individual count of genes to proportion using the total respective repertoire size} \item{exportTable}{Returns the data frame used for forming the graph.} -\item{palette}{Colors to use in visualization - input any +\item{palette}{Colors to use in visualization - input any \link[grDevices]{hcl.pals}.} } \value{ ggplot bar diagram or heatmap of gene usage } \description{ -This function will allow for the visualizing the distribution +This function will allow for the visualizing the distribution of the any VDJ and C gene of the TCR or BCR using heatmap or -bar chart. This function requires assumes two chains were used in -defining clone, if not, it will default to the only chain +bar chart. This function requires assumes two chains were used in +defining clone, if not, it will default to the only chain present regardless of the chain parameter. } \examples{ #Making combined contig data -combined <- combineTCR(contig_list, - samples = c("P17B", "P17L", "P18B", "P18L", +combined <- combineTCR(contig_list, + samples = c("P17B", "P17L", "P18B", "P18L", "P19B","P19L", "P20B", "P20L")) -vizGenes(combined, +vizGenes(combined, x.axis = "TRBV", y.axis = NULL, plot = "heatmap") From 2563a640e28d81091bb4b50e1e7792975b42775b Mon Sep 17 00:00:00 2001 From: Qile0317 Date: Sun, 27 Oct 2024 16:39:12 -0700 Subject: [PATCH 03/11] re-format & document & refactor loadContigs --- R/loadContigs.R | 110 ++++++++++++++++++++++++++------------------- man/loadContigs.Rd | 27 +++++------ 2 files changed, 78 insertions(+), 59 deletions(-) diff --git a/R/loadContigs.R b/R/loadContigs.R index 06119d87..5da719f6 100644 --- a/R/loadContigs.R +++ b/R/loadContigs.R @@ -32,61 +32,79 @@ #' #' @param input The directory in which contigs are located or a list with contig elements #' @param format The format of the single-cell contig, currently supporting: -#' "10X", "AIRR", "BD", "Dandelion", "JSON", "MiXCR", "ParseBio", "Omniscope", "TRUST4", and "WAT3R" +#' "10X", "AIRR", "BD", "Dandelion", "JSON", "MiXCR", "ParseBio", "Omniscope", +#' "TRUST4", "WAT3R", and "Immcantation" #' @importFrom utils read.csv read.delim #' @importFrom rjson fromJSON #' @export #' @concept Loading_and_Processing_Contigs #' @return List of contigs for compatibility with \code{\link{combineTCR}} or #' \code{\link{combineBCR}} -loadContigs <- function(input, - format = "10X") { - #Loading from directory, recursively - if (inherits(x=input, what ="character")) { - format.list <- list("WAT3R" = "barcode_results.csv", - "10X" = "filtered_contig_annotations.csv", - "AIRR" = "airr_rearrangement.tsv", - "Dandelion" = "all_contig_dandelion.tsv", - "Immcantation" = "_data.tsv", - "MiXCR" = "clones.tsv", - "JSON" = ".json", - "TRUST4" = "barcode_report.tsv", - "BD" = "Contigs_AIRR.tsv", - "Omniscope" =c("_OSB.csv", "_OST.csv"), - "ParseBio" = "barcode_report.tsv") +loadContigs <- function(input, format = "10X") { + + assert_that(is.string(input) || is.list(input) || is.data.frame(input)) + assert_that(is.string(format)) + assert_that(format %in% c( + "10X", "AIRR", "BD", "Dandelion", "JSON", "MiXCR", "ParseBio", + "Omniscope", "TRUST4", "WAT3R", "Immcantation" + )) + + #Loading from directory, recursively + df <- if (inherits(x = input, what = "character")) { + + format.list <- list("WAT3R" = "barcode_results.csv", + "10X" = "filtered_contig_annotations.csv", + "AIRR" = "airr_rearrangement.tsv", + "Dandelion" = "all_contig_dandelion.tsv", + "Immcantation" = "_data.tsv", + "MiXCR" = "clones.tsv", + "JSON" = ".json", + "TRUST4" = "barcode_report.tsv", + "BD" = "Contigs_AIRR.tsv", + "Omniscope" =c("_OSB.csv", "_OST.csv"), + "ParseBio" = "barcode_report.tsv") file.pattern <- format.list[[format]] - contig.files <- list.files(input, paste0(file.pattern, collapse = "|"), recursive = TRUE, full.names = TRUE) - - if (format %in% c("10X", "WAT3R", "Omniscope")) { - df <- lapply(contig.files, read.csv) - } else if(format %in% c("json")) { - df <- lapply(contig.files, function(x) { - tmp <- as.data.frame(fromJSON(x)) - }) + contig.files <- list.files( + input, + paste0(file.pattern, collapse = "|"), + recursive = TRUE, + full.names = TRUE + ) + + if (length(contig.files) == 0) { + warning("No files found in the directory") + return(list()) + } + + reader <- if (format == "json") { + function(x) as.data.frame(fromJSON(x)) + } else if (format %in% c("10X", "WAT3R", "Omniscope")) { + read.csv } else { - df <- lapply(contig.files, read.delim) + read.delim } - #Already loaded list or data frame - } else if (inherits(x=input, what ="list") | inherits(x=input, what ="data.frame")) { - df <- .checkList(input) - } - - loadFunc <- switch(format, - "10X" = .parse10x, - "AIRR" = .parseAIRR, - "Dandelion" = .parseDandelion, - "JSON" = .parseJSON, - "MiXCR" = .parseMiXCR, - "TRUST4" = .parseTRUST4, - "BD" = .parseBD, - "WAT3R" = .parseWAT3R, - "Omniscope" = .parseOmniscope, - "Immcantation" = .parseImmcantation, - "ParseBio" = .parseParse, - stop("Invalid format provided")) - - df <- loadFunc(df) - return(df) + + lapply(contig.files, reader) + + } else { # handle an already loaded list of dfs / 1 df + .checkList(input) + } + + loadFunc <- switch(format, + "10X" = .parse10x, + "AIRR" = .parseAIRR, + "Dandelion" = .parseDandelion, + "JSON" = .parseJSON, + "MiXCR" = .parseMiXCR, + "TRUST4" = .parseTRUST4, + "BD" = .parseBD, + "WAT3R" = .parseWAT3R, + "Omniscope" = .parseOmniscope, + "Immcantation" = .parseImmcantation, + "ParseBio" = .parseParse + ) + + loadFunc(df) } #Formats TRUST4 data diff --git a/man/loadContigs.Rd b/man/loadContigs.Rd index a96001be..186be719 100644 --- a/man/loadContigs.Rd +++ b/man/loadContigs.Rd @@ -9,33 +9,34 @@ loadContigs(input, format = "10X") \arguments{ \item{input}{The directory in which contigs are located or a list with contig elements} -\item{format}{The format of the single-cell contig, currently supporting: -"10X", "AIRR", "BD", "Dandelion", "JSON", "MiXCR", "ParseBio", "Omniscope", "TRUST4", and "WAT3R"} +\item{format}{The format of the single-cell contig, currently supporting: +"10X", "AIRR", "BD", "Dandelion", "JSON", "MiXCR", "ParseBio", "Omniscope", +"TRUST4", "WAT3R", and "Immcantation"} } \value{ -List of contigs for compatibility with \code{\link{combineTCR}} or +List of contigs for compatibility with \code{\link{combineTCR}} or \code{\link{combineBCR}} } \description{ -This function generates a contig list and formats the data to allow for -function with \code{\link{combineTCR}} or \code{\link{combineBCR}}. If -using data derived from filtered outputs of 10X Genomics, there is no +This function generates a contig list and formats the data to allow for +function with \code{\link{combineTCR}} or \code{\link{combineBCR}}. If +using data derived from filtered outputs of 10X Genomics, there is no need to use this function as the data is already compatible. } \details{ -The files that this function parses includes: +The files that this function parses includes: \itemize{ \item 10X = "filtered_contig_annotations.csv" - \item AIRR = "airr_rearrangement.tsv" - \item BD = "Contigs_AIRR.tsv" - \item Dandelion = "all_contig_dandelion.tsv" - \item Immcantation = "data.tsv" + \item AIRR = "airr_rearrangement.tsv" + \item BD = "Contigs_AIRR.tsv" + \item Dandelion = "all_contig_dandelion.tsv" + \item Immcantation = "data.tsv" \item JSON = ".json" \item ParseBio = "barcode_report.tsv" \item MiXCR = "clones.tsv" - \item Omniscope = ".csv" + \item Omniscope = ".csv" \item TRUST4 = "barcode_report.tsv" - \item WAT3R = "barcode_results.csv" + \item WAT3R = "barcode_results.csv" } } \examples{ From 88e6c9d7e82f3d2a1460176c83e07faefc593fec Mon Sep 17 00:00:00 2001 From: Qile0317 Date: Sun, 27 Oct 2024 16:47:37 -0700 Subject: [PATCH 04/11] slight improvement to loadContigs regex, pinpoint 429 cause --- R/loadContigs.R | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/R/loadContigs.R b/R/loadContigs.R index 5da719f6..2423fde0 100644 --- a/R/loadContigs.R +++ b/R/loadContigs.R @@ -52,21 +52,23 @@ loadContigs <- function(input, format = "10X") { #Loading from directory, recursively df <- if (inherits(x = input, what = "character")) { - format.list <- list("WAT3R" = "barcode_results.csv", - "10X" = "filtered_contig_annotations.csv", - "AIRR" = "airr_rearrangement.tsv", - "Dandelion" = "all_contig_dandelion.tsv", - "Immcantation" = "_data.tsv", - "MiXCR" = "clones.tsv", - "JSON" = ".json", - "TRUST4" = "barcode_report.tsv", - "BD" = "Contigs_AIRR.tsv", - "Omniscope" =c("_OSB.csv", "_OST.csv"), - "ParseBio" = "barcode_report.tsv") + format.list <- list( + "WAT3R" = "barcode_results.csv", + "10X" = "filtered_contig_annotations.csv", + "AIRR" = "airr_rearrangement.tsv", + "Dandelion" = "all_contig_dandelion.tsv", + "Immcantation" = "_data.tsv", + "MiXCR" = "clones.tsv", + "JSON" = ".json", + "TRUST4" = "barcode_report.tsv", + "BD" = "Contigs_AIRR.tsv", + "Omniscope" = c("_OSB.csv", "_OST.csv"), + "ParseBio" = "barcode_report.tsv" + ) file.pattern <- format.list[[format]] contig.files <- list.files( input, - paste0(file.pattern, collapse = "|"), + paste0("*", file.pattern, "$", collapse = "|"), recursive = TRUE, full.names = TRUE ) @@ -129,7 +131,7 @@ loadContigs <- function(input, format = "10X") { chain1 <- str_split(df[[i]]$chain2, ",", simplify = TRUE)[,seq_len(7)] chain1[chain1 == "*"] <- "None" } - colnames(chain1) <- c("v_gene", "d_gene", "j_gene", "c_gene", "cdr3_nt", "cdr3", "reads") + colnames(chain1) <- c("v_gene", "d_gene", "j_gene", "c_gene", "cdr3_nt", "cdr3", "reads") # issue 429 chain1 <- data.frame(barcode = df[[i]][,1], chain1) data2 <- rbind(chain1, chain2) data2[data2 == ""] <- NA From a51807bb88172f520e64d1c51761037b0d09c110 Mon Sep 17 00:00:00 2001 From: Qile0317 Date: Sun, 27 Oct 2024 16:51:13 -0700 Subject: [PATCH 05/11] rename loadContig internal var --- R/loadContigs.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/loadContigs.R b/R/loadContigs.R index 2423fde0..aad79b7d 100644 --- a/R/loadContigs.R +++ b/R/loadContigs.R @@ -50,7 +50,7 @@ loadContigs <- function(input, format = "10X") { )) #Loading from directory, recursively - df <- if (inherits(x = input, what = "character")) { + rawDataDfList <- if (inherits(x = input, what = "character")) { format.list <- list( "WAT3R" = "barcode_results.csv", @@ -106,7 +106,7 @@ loadContigs <- function(input, format = "10X") { "ParseBio" = .parseParse ) - loadFunc(df) + loadFunc(rawDataDfList) } #Formats TRUST4 data From 3fdfc95ec08c074639874c67d5e5db5658ea24bc Mon Sep 17 00:00:00 2001 From: Qile0317 Date: Sun, 27 Oct 2024 17:03:11 -0700 Subject: [PATCH 06/11] reformat TRUST4 parser --- R/loadContigs.R | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/R/loadContigs.R b/R/loadContigs.R index aad79b7d..53b66b02 100644 --- a/R/loadContigs.R +++ b/R/loadContigs.R @@ -109,30 +109,30 @@ loadContigs <- function(input, format = "10X") { loadFunc(rawDataDfList) } -#Formats TRUST4 data +#' Formats TRUST4 data #' @importFrom stringr str_split .parseTRUST4 <- function(df) { for (i in seq_along(df)) { colnames(df[[i]])[1] <- "barcode" df[[i]][df[[i]] == "*"] <- NA - + if(length(which(is.na(df[[i]]$chain1))) == length(df[[i]]$chain1)) { - chain2 <- matrix(ncol = 7, nrow = length(df[[i]]$chain1)) + chain2 <- matrix(ncol = 7, nrow = length(df[[i]]$chain1)) } else { - chain2 <- str_split(df[[i]]$chain1, ",", simplify = TRUE)[,seq_len(7)] - chain2[chain2 == "*"] <- "None" + chain2 <- str_split(df[[i]]$chain1, ",", simplify = TRUE)[, seq_len(7)] + chain2[chain2 == "*"] <- "None" } colnames(chain2) <- c("v_gene", "d_gene", "j_gene", "c_gene", "cdr3_nt", "cdr3", "reads") - chain2 <- data.frame(barcode = df[[i]][,1], chain2) - + chain2 <- data.frame(barcode = df[[i]][, 1], chain2) + if(length(which(is.na(df[[i]]$chain2))) == length(df[[i]]$chain2)) { - chain1 <- matrix(ncol = 7, nrow = length(df[[i]]$chain2)) + chain1 <- matrix(ncol = 7, nrow = length(df[[i]]$chain2)) } else { - chain1 <- str_split(df[[i]]$chain2, ",", simplify = TRUE)[,seq_len(7)] - chain1[chain1 == "*"] <- "None" + chain1 <- str_split(df[[i]]$chain2, ",", simplify = TRUE)[, seq_len(7)] + chain1[chain1 == "*"] <- "None" } colnames(chain1) <- c("v_gene", "d_gene", "j_gene", "c_gene", "cdr3_nt", "cdr3", "reads") # issue 429 - chain1 <- data.frame(barcode = df[[i]][,1], chain1) + chain1 <- data.frame(barcode = df[[i]][, 1], chain1) data2 <- rbind(chain1, chain2) data2[data2 == ""] <- NA df[[i]] <- data2 From bd77186f456b4a17373dd6d286ff8778bccb7b02 Mon Sep 17 00:00:00 2001 From: Qile0317 Date: Sun, 27 Oct 2024 17:08:15 -0700 Subject: [PATCH 07/11] more reformatting of TRUST4 --- R/loadContigs.R | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/R/loadContigs.R b/R/loadContigs.R index 53b66b02..3513159e 100644 --- a/R/loadContigs.R +++ b/R/loadContigs.R @@ -5,7 +5,7 @@ #' using data derived from filtered outputs of 10X Genomics, there is no #' need to use this function as the data is already compatible. #' -#' The files that this function parses includes: +#' The files that this function parses includes: #' \itemize{ #' \item 10X = "filtered_contig_annotations.csv" #' \item AIRR = "airr_rearrangement.tsv" @@ -30,7 +30,8 @@ #' WAT3R <- read.csv("https://www.borch.dev/uploads/contigs/WAT3R_contigs.csv") #' contig.list <- loadContigs(WAT3R, format = "WAT3R") #' -#' @param input The directory in which contigs are located or a list with contig elements +#' @param input The directory in which contigs are located or a list with contig +#' elements #' @param format The format of the single-cell contig, currently supporting: #' "10X", "AIRR", "BD", "Dandelion", "JSON", "MiXCR", "ParseBio", "Omniscope", #' "TRUST4", "WAT3R", and "Immcantation" @@ -112,11 +113,13 @@ loadContigs <- function(input, format = "10X") { #' Formats TRUST4 data #' @importFrom stringr str_split .parseTRUST4 <- function(df) { + for (i in seq_along(df)) { + colnames(df[[i]])[1] <- "barcode" df[[i]][df[[i]] == "*"] <- NA - if(length(which(is.na(df[[i]]$chain1))) == length(df[[i]]$chain1)) { + if (length(which(is.na(df[[i]]$chain1))) == length(df[[i]]$chain1)) { chain2 <- matrix(ncol = 7, nrow = length(df[[i]]$chain1)) } else { chain2 <- str_split(df[[i]]$chain1, ",", simplify = TRUE)[, seq_len(7)] @@ -125,7 +128,7 @@ loadContigs <- function(input, format = "10X") { colnames(chain2) <- c("v_gene", "d_gene", "j_gene", "c_gene", "cdr3_nt", "cdr3", "reads") chain2 <- data.frame(barcode = df[[i]][, 1], chain2) - if(length(which(is.na(df[[i]]$chain2))) == length(df[[i]]$chain2)) { + if (length(which(is.na(df[[i]]$chain2))) == length(df[[i]]$chain2)) { chain1 <- matrix(ncol = 7, nrow = length(df[[i]]$chain2)) } else { chain1 <- str_split(df[[i]]$chain2, ",", simplify = TRUE)[, seq_len(7)] @@ -137,8 +140,16 @@ loadContigs <- function(input, format = "10X") { data2[data2 == ""] <- NA df[[i]] <- data2 } - df <- .chain.parser(df) - return(df) + + .chain.parser(df) +} + +#Grabs the chain info from v_gene +.chain.parser <- function(df) { + lapply(df, function(x) { + x$chain <- substr(x$v_gene, 1, 3) + x + }) } #Formats wat3r data @@ -204,14 +215,6 @@ loadContigs <- function(input, format = "10X") { } return(df) } -#Grabs the chain info from v_gene -.chain.parser <- function(df) { - for (i in seq_along(df)) { - df[[i]]$chain <- substr(df[[i]][,"v_gene"],1,3) - } - return(df) -} - .parseOmniscope <- function(df) { for (i in seq_along(df)) { From df2515ed8b1fd57c00ac634e8f1e1927bfd14661 Mon Sep 17 00:00:00 2001 From: Qile0317 Date: Sun, 27 Oct 2024 17:26:49 -0700 Subject: [PATCH 08/11] fix .parseTRUST4 for 1 row edgecase --- R/loadContigs.R | 10 +++++----- man/loadContigs.Rd | 5 +++-- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/R/loadContigs.R b/R/loadContigs.R index 3513159e..c71bd541 100644 --- a/R/loadContigs.R +++ b/R/loadContigs.R @@ -110,7 +110,7 @@ loadContigs <- function(input, format = "10X") { loadFunc(rawDataDfList) } -#' Formats TRUST4 data +#Formats TRUST4 data #' @importFrom stringr str_split .parseTRUST4 <- function(df) { @@ -122,7 +122,7 @@ loadContigs <- function(input, format = "10X") { if (length(which(is.na(df[[i]]$chain1))) == length(df[[i]]$chain1)) { chain2 <- matrix(ncol = 7, nrow = length(df[[i]]$chain1)) } else { - chain2 <- str_split(df[[i]]$chain1, ",", simplify = TRUE)[, seq_len(7)] + chain2 <- str_split(df[[i]]$chain1, ",", simplify = TRUE)[, seq_len(7), drop = FALSE] chain2[chain2 == "*"] <- "None" } colnames(chain2) <- c("v_gene", "d_gene", "j_gene", "c_gene", "cdr3_nt", "cdr3", "reads") @@ -131,14 +131,14 @@ loadContigs <- function(input, format = "10X") { if (length(which(is.na(df[[i]]$chain2))) == length(df[[i]]$chain2)) { chain1 <- matrix(ncol = 7, nrow = length(df[[i]]$chain2)) } else { - chain1 <- str_split(df[[i]]$chain2, ",", simplify = TRUE)[, seq_len(7)] + chain1 <- str_split(df[[i]]$chain2, ",", simplify = TRUE)[, seq_len(7), drop = FALSE] chain1[chain1 == "*"] <- "None" } - colnames(chain1) <- c("v_gene", "d_gene", "j_gene", "c_gene", "cdr3_nt", "cdr3", "reads") # issue 429 + colnames(chain1) <- c("v_gene", "d_gene", "j_gene", "c_gene", "cdr3_nt", "cdr3", "reads") chain1 <- data.frame(barcode = df[[i]][, 1], chain1) data2 <- rbind(chain1, chain2) data2[data2 == ""] <- NA - df[[i]] <- data2 + df[[i]] <- data2 # is it necessary to drop rows that are fully NA with an existing barcode? } .chain.parser(df) diff --git a/man/loadContigs.Rd b/man/loadContigs.Rd index 186be719..8025e626 100644 --- a/man/loadContigs.Rd +++ b/man/loadContigs.Rd @@ -7,7 +7,8 @@ loadContigs(input, format = "10X") } \arguments{ -\item{input}{The directory in which contigs are located or a list with contig elements} +\item{input}{The directory in which contigs are located or a list with contig +elements} \item{format}{The format of the single-cell contig, currently supporting: "10X", "AIRR", "BD", "Dandelion", "JSON", "MiXCR", "ParseBio", "Omniscope", @@ -24,7 +25,7 @@ using data derived from filtered outputs of 10X Genomics, there is no need to use this function as the data is already compatible. } \details{ -The files that this function parses includes: +The files that this function parses includes: \itemize{ \item 10X = "filtered_contig_annotations.csv" \item AIRR = "airr_rearrangement.tsv" From 4e070946b3b22d09d2dd6e4042df1b9f90d40eda Mon Sep 17 00:00:00 2001 From: Qile0317 Date: Sun, 27 Oct 2024 17:27:18 -0700 Subject: [PATCH 09/11] add dev to PR --- .github/workflows/R-CMD-check.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 0ce19cc8..9e047d18 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -4,7 +4,7 @@ on: push: branches: [main, master, v2] pull_request: - branches: [main, master, v2] + branches: [main, master, v2, dev] name: R-CMD-check From af22d98bab8c21f61825e47547a11683b33d0636 Mon Sep 17 00:00:00 2001 From: Qile0317 Date: Sun, 27 Oct 2024 17:35:57 -0700 Subject: [PATCH 10/11] update NEWS --- NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/NEWS.md b/NEWS.md index 69719082..65bb7a8a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,6 +5,7 @@ * Removed unnecessary code remnant in ```clonalLength()``` * Allow one sample to be plotted by ```percentVJ()``` * Fixed issue with ```positionalProperty()``` and exportTable +* Fixed issue with ```loadContigs()``` edgecase when TRUST4 data only has 1 row. # scRepertoire VERSION 2.0.7 From c18c1d0f58b629442196177c893530b4d235bb76 Mon Sep 17 00:00:00 2001 From: Qile0317 Date: Sun, 27 Oct 2024 17:45:51 -0700 Subject: [PATCH 11/11] refactor parseTRUST4 --- R/loadContigs.R | 49 +++++++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/R/loadContigs.R b/R/loadContigs.R index c71bd541..354e7402 100644 --- a/R/loadContigs.R +++ b/R/loadContigs.R @@ -114,34 +114,35 @@ loadContigs <- function(input, format = "10X") { #' @importFrom stringr str_split .parseTRUST4 <- function(df) { - for (i in seq_along(df)) { - - colnames(df[[i]])[1] <- "barcode" - df[[i]][df[[i]] == "*"] <- NA - - if (length(which(is.na(df[[i]]$chain1))) == length(df[[i]]$chain1)) { - chain2 <- matrix(ncol = 7, nrow = length(df[[i]]$chain1)) + processChain <- function(data, chain_col) { + if (all(is.na(data[[chain_col]]))) { + chain <- matrix(ncol = 7, nrow = length(data[[chain_col]])) } else { - chain2 <- str_split(df[[i]]$chain1, ",", simplify = TRUE)[, seq_len(7), drop = FALSE] - chain2[chain2 == "*"] <- "None" + chain <- str_split(data[[chain_col]], ",", simplify = TRUE) + chain <- chain[, seq_len(7), drop = FALSE] + chain[chain == "*"] <- "None" } - colnames(chain2) <- c("v_gene", "d_gene", "j_gene", "c_gene", "cdr3_nt", "cdr3", "reads") - chain2 <- data.frame(barcode = df[[i]][, 1], chain2) - - if (length(which(is.na(df[[i]]$chain2))) == length(df[[i]]$chain2)) { - chain1 <- matrix(ncol = 7, nrow = length(df[[i]]$chain2)) - } else { - chain1 <- str_split(df[[i]]$chain2, ",", simplify = TRUE)[, seq_len(7), drop = FALSE] - chain1[chain1 == "*"] <- "None" - } - colnames(chain1) <- c("v_gene", "d_gene", "j_gene", "c_gene", "cdr3_nt", "cdr3", "reads") - chain1 <- data.frame(barcode = df[[i]][, 1], chain1) - data2 <- rbind(chain1, chain2) - data2[data2 == ""] <- NA - df[[i]] <- data2 # is it necessary to drop rows that are fully NA with an existing barcode? + colnames(chain) <- c( + "v_gene", "d_gene", "j_gene", "c_gene", "cdr3_nt", "cdr3", "reads" + ) + data.frame(barcode = data$barcode, chain) } - .chain.parser(df) + formattedDfs <- lapply(df, function(data) { + + colnames(data)[1] <- "barcode" + data[data == "*"] <- NA + + # not a mistake, opposite definitions in TRUST4 and scRepertoire + chain1 <- processChain(data, "chain2") + chain2 <- processChain(data, "chain1") + + combined_data <- rbind(chain1, chain2) + combined_data[combined_data == ""] <- NA + combined_data + }) + # is it necessary to drop rows that are fully NA with an existing barcode? + .chain.parser(formattedDfs) } #Grabs the chain info from v_gene