diff --git a/README.md b/README.md index e7c551c0..41865280 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,8 @@ The manuscript is available in [Nature Protocols](https://www.nature.com/article ## What's new +Version 1.2.2 fixes some critical bugs that affected the performance of the `aberrantExpression` pipeline, and allows sample IDs to be numeric. + As of version 1.2.1 DROP has a new module that performs RNA-seq variant calling. The input are BAM files and the output either a single-sample or a multi-sample VCF file (option specified by the user) annotated with allele frequencies from gnomAD (if specified by the user). The sample annotation table does not need to be changed, but several new parameters in the config file have to be added and tuned. For more info, refer to the [documentation](https://gagneurlab-drop.readthedocs.io/en/latest/prepare.html#rna-variant-calling-dictionary). Also, the integration of external split and non-split counts to detect aberrant splicing is now possible. Simply specify in a new column in the sample annotation the directory containing the counts. For more info, refer to the [documentation](https://gagneurlab-drop.readthedocs.io/en/latest/prepare.html#external-count-examples). diff --git a/docs/source/conf.py b/docs/source/conf.py index ee8cf65d..c95e81de 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -23,7 +23,7 @@ author = 'Michaela Müller' # The full version, including alpha/beta/rc tags -release_ = '1.2.1' +release_ = '1.2.2' diff --git a/drop/__init__.py b/drop/__init__.py index 1fc16f83..8a6b8bad 100644 --- a/drop/__init__.py +++ b/drop/__init__.py @@ -4,5 +4,5 @@ from . import utils from . import demo -__version__ = "1.2.1" +__version__ = "1.2.2" diff --git a/drop/cli.py b/drop/cli.py index 6e9b9a41..9b7b8b3b 100644 --- a/drop/cli.py +++ b/drop/cli.py @@ -17,7 +17,7 @@ @click.group() @click_log.simple_verbosity_option(logger) -@click.version_option('1.2.1',prog_name='drop') +@click.version_option('1.2.2',prog_name='drop') def main(): diff --git a/drop/config/SampleAnnotation.py b/drop/config/SampleAnnotation.py index 8bcadea3..f6f7911b 100644 --- a/drop/config/SampleAnnotation.py +++ b/drop/config/SampleAnnotation.py @@ -181,6 +181,9 @@ def subsetSampleAnnotation(self, column, values, subset=None): # check if column is valid elif column not in sa_cols: raise KeyError(f"Column '{column}' not present in sample annotation.") + # return empty subset if empty + elif subset.empty: + return subset #subset column for matching values else: return utils.subsetBy(subset, column, values) @@ -272,7 +275,6 @@ def getImportCountFiles(self, annotation, group, file_type: str = "GENE_COUNTS_F #subset for the annotation_key in the annotation group and the group_key in the group subset = self.subsetSampleAnnotation(annotation_key, annotation) subset = self.subsetSampleAnnotation(group_key, group, subset) - ans = subset[file_type].tolist() if asSet: ans = set(ans) diff --git a/drop/config/submodules/AberrantExpression.py b/drop/config/submodules/AberrantExpression.py index 5e25b1e6..adb29b9c 100644 --- a/drop/config/submodules/AberrantExpression.py +++ b/drop/config/submodules/AberrantExpression.py @@ -48,15 +48,16 @@ def getCountFiles(self, annotation, group): :return: list of files """ - # if sample annotation table does not contain GENE_COUNTS_FILE column. return no external counts - if("GENE_COUNTS_FILE" not in self.sampleAnnotation.SAMPLE_ANNOTATION_COLUMNS): - return [] bam_IDs = self.sampleAnnotation.getIDsByGroup(group, assay="RNA") file_stump = self.processedDataDir / "aberrant_expression" / annotation / "counts" / "{sampleID}.Rds" count_files = expand(str(file_stump), sampleID=bam_IDs) - extCountFiles = self.sampleAnnotation.getImportCountFiles(annotation, group, file_type="GENE_COUNTS_FILE") - count_files.extend(extCountFiles) + # if sample annotation table does not contain GENE_COUNTS_FILE column. return no external counts + if("GENE_COUNTS_FILE" not in self.sampleAnnotation.SAMPLE_ANNOTATION_COLUMNS): + extCountFiles = [] + else: + extCountFiles = self.sampleAnnotation.getImportCountFiles(annotation, group, file_type="GENE_COUNTS_FILE") + count_files.extend(extCountFiles) return count_files def getCountParams(self, rnaID): diff --git a/drop/modules/aberrant-expression-pipeline/Counting/filterCounts.R b/drop/modules/aberrant-expression-pipeline/Counting/filterCounts.R index 82405251..da543e6a 100644 --- a/drop/modules/aberrant-expression-pipeline/Counting/filterCounts.R +++ b/drop/modules/aberrant-expression-pipeline/Counting/filterCounts.R @@ -36,7 +36,17 @@ ods <- filterExpression(ods, gtfFile=txdb, filter=FALSE, # add column for genes with at least 1 gene rowData(ods)$counted1sample = rowSums(assay(ods)) > 0 -has_external <- !(all(ods@colData$GENE_COUNTS_FILE == "") || is.null(ods@colData$GENE_COUNTS_FILE)) +# External data check +if (is.null(ods@colData$GENE_COUNTS_FILE)){ #column does not exist in sample annotation table + has_external <- FALSE +}else if(all(is.na(ods@colData$GENE_COUNTS_FILE))){ #column exists but it has no values + has_external <- FALSE +}else if(all(ods@colData$GENE_COUNTS_FILE == "")){ #column exists with non-NA values but this group has all empty strings + has_external <- FALSE +}else{ #column exists with non-NA values and this group has at least 1 non-empty string + has_external <- TRUE +} + if(has_external){ ods@colData$isExternal <- as.factor(ods@colData$GENE_COUNTS_FILE != "") }else{ diff --git a/drop/modules/aberrant-expression-pipeline/Counting/mergeCounts.R b/drop/modules/aberrant-expression-pipeline/Counting/mergeCounts.R index ef847a52..fd62a4e9 100644 --- a/drop/modules/aberrant-expression-pipeline/Counting/mergeCounts.R +++ b/drop/modules/aberrant-expression-pipeline/Counting/mergeCounts.R @@ -56,10 +56,10 @@ total_counts <- SummarizedExperiment(assays=list(counts=merged_assays)) rowRanges(total_counts) <- count_ranges # Add sample annotation data (colData) -sample_anno <- fread(snakemake@config$sampleAnnotation) +sample_anno <- fread(snakemake@config$sampleAnnotation, + colClasses = c(RNA_ID = 'character', DNA_ID = 'character')) sample_anno <- sample_anno[, .SD[1], by = RNA_ID] col_data <- data.table(RNA_ID = as.character(colnames(total_counts))) -sample_anno[, RNA_ID := as.character(RNA_ID)] col_data <- left_join(col_data, sample_anno, by = "RNA_ID") rownames(col_data) <- col_data$RNA_ID colData(total_counts) <- as(col_data, "DataFrame") diff --git a/drop/modules/aberrant-expression-pipeline/OUTRIDER/results.R b/drop/modules/aberrant-expression-pipeline/OUTRIDER/results.R index c76b252a..4096aca4 100644 --- a/drop/modules/aberrant-expression-pipeline/OUTRIDER/results.R +++ b/drop/modules/aberrant-expression-pipeline/OUTRIDER/results.R @@ -55,7 +55,8 @@ if(!is.null(gene_annot_dt$gene_name)){ } # Add HPO terms, requires online connection and for there to be annotated HPO terms -sa <- fread(snakemake@config$sampleAnnotation) +sa <- fread(snakemake@config$sampleAnnotation, + colClasses = c(RNA_ID = 'character', DNA_ID = 'character')) if(!is.null(sa$HPO_TERMS) & nrow(res) > 0){ if(!all(is.na(sa$HPO_TERMS)) & ! all(sa$HPO_TERMS == '')){ res <- add_HPO_cols(res, hpo_file = snakemake@params$hpoFile) diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/exportCounts.R b/drop/modules/aberrant-splicing-pipeline/Counting/exportCounts.R index 253b2d47..5577aee3 100644 --- a/drop/modules/aberrant-splicing-pipeline/Counting/exportCounts.R +++ b/drop/modules/aberrant-splicing-pipeline/Counting/exportCounts.R @@ -10,7 +10,7 @@ #' input: #' - annotation: '`sm cfg.getProcessedDataDir() + "/preprocess/{annotation}/txdb.db"`' #' - fds_theta: '`sm cfg.getProcessedDataDir() + -#' "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/theta.h5"`' +#' "/aberrant_splicing/datasets/savedObjects/raw-local-{dataset}/theta.h5"`' #' output: #' - k_counts: '`sm expand(cfg.exportCounts.getFilePattern(str_=True, expandStr=True) + "/k_{metric}_counts.tsv.gz", metric=["j", "theta"])`' #' - n_counts: '`sm expand(cfg.exportCounts.getFilePattern(str_=True, expandStr=True) + "/n_{metric}_counts.tsv.gz", metric=["psi5", "psi3", "theta"])`' diff --git a/drop/modules/aberrant-splicing-pipeline/FRASER/07_extract_results_FraseR.R b/drop/modules/aberrant-splicing-pipeline/FRASER/07_extract_results_FraseR.R index f1fe06d1..fbd47424 100644 --- a/drop/modules/aberrant-splicing-pipeline/FRASER/07_extract_results_FraseR.R +++ b/drop/modules/aberrant-splicing-pipeline/FRASER/07_extract_results_FraseR.R @@ -81,8 +81,7 @@ if(nrow(res_junc_dt) > 0){ # add colData to the results res_junc_dt <- merge(res_junc_dt, as.data.table(colData(fds)), by = "sampleID") - res_junc_dt[, c("bamFile", "pairedEnd") := NULL] - setnames(res_junc_dt, 'STRAND', 'STRAND_SPECIFIC') # otherwise it's confusing with the 'strand' column from the junction + res_junc_dt[, c("bamFile", "pairedEnd", "STRAND") := NULL] } else{ warning("The aberrant splicing pipeline gave 0 results for the ", dataset, " dataset.") } @@ -91,11 +90,11 @@ if(nrow(res_junc_dt) > 0){ if(length(res_junc) > 0){ res_genes_dt <- resultsByGenes(res_junc) %>% as.data.table res_genes_dt <- merge(res_genes_dt, as.data.table(colData(fds)), by = "sampleID") - res_genes_dt[, c("bamFile", "pairedEnd") := NULL] - setnames(res_genes_dt, 'STRAND', 'STRAND_SPECIFIC') - + res_genes_dt[, c("bamFile", "pairedEnd", "STRAND") := NULL] + # add HPO overlap information - sa <- fread(snakemake@config$sampleAnnotation) + sa <- fread(snakemake@config$sampleAnnotation, + colClasses = c(RNA_ID = 'character', DNA_ID = 'character')) if(!is.null(sa$HPO_TERMS)){ if(!all(is.na(sa$HPO_TERMS)) & ! all(sa$HPO_TERMS == '')){ res_genes_dt <- add_HPO_cols(res_genes_dt, hpo_file = snakemake@params$hpoFile) diff --git a/drop/modules/mae-pipeline/QC/DNA_RNA_matrix_plot.R b/drop/modules/mae-pipeline/QC/DNA_RNA_matrix_plot.R index eba0bd9b..a00b6bb6 100644 --- a/drop/modules/mae-pipeline/QC/DNA_RNA_matrix_plot.R +++ b/drop/modules/mae-pipeline/QC/DNA_RNA_matrix_plot.R @@ -59,7 +59,8 @@ ggplot(melt_mat, aes(value)) + geom_histogram(fill = 'cadetblue4', binwidth = 0. #' * Is the sample a relative of the other? #' -sa <- fread(snakemake@config$sampleAnnotation)[, .(DNA_ID, RNA_ID)] +sa <- fread(snakemake@config$sampleAnnotation, + colClasses = c(RNA_ID = 'character', DNA_ID = 'character'))[, .(DNA_ID, RNA_ID)] sa[, ANNOTATED_MATCH := TRUE] colnames(melt_mat)[1:2] <- c('DNA_ID', 'RNA_ID') diff --git a/drop/modules/mae-pipeline/QC/create_matrix_dna_rna_cor.R b/drop/modules/mae-pipeline/QC/create_matrix_dna_rna_cor.R index 7f8f910d..9fe9d906 100644 --- a/drop/modules/mae-pipeline/QC/create_matrix_dna_rna_cor.R +++ b/drop/modules/mae-pipeline/QC/create_matrix_dna_rna_cor.R @@ -27,14 +27,15 @@ suppressPackageStartupMessages({ }) register(MulticoreParam(snakemake@threads)) -sa <- fread(snakemake@config$sampleAnnotation) +sa <- fread(snakemake@config$sampleAnnotation, + colClasses = c(RNA_ID = 'character', DNA_ID = 'character')) # Read the test vcf as GRanges gr_test <- readVcf(snakemake@config$mae$qcVcf) %>% granges() mcols(gr_test)$GT <- "0/0" # Obtain the rna and vcf files -rna_samples <- snakemake@params$rnaIds +rna_samples <- as.character(snakemake@params$rnaIds) mae_res <- snakemake@input$mae_res vcf_cols <- sa[RNA_ID %in% rna_samples, .(DNA_ID, DNA_VCF_FILE)] %>% unique %>% na.omit() @@ -46,7 +47,7 @@ N <- length(vcf_files) lp <- bplapply(1:N, function(i){ # Read sample vcf file - sample <- wes_samples[i] + sample <- wes_samples[i] %>% as.character() param <- ScanVcfParam(fixed=NA, info='NT', geno='GT', samples=sample, trimEmpty=TRUE) vcf_sample <- readVcf(vcf_files[i], param = param, row.names = FALSE) # Get GRanges and add Genotype diff --git a/drop/template/config.yaml b/drop/template/config.yaml index 14473d8a..353ca699 100644 --- a/drop/template/config.yaml +++ b/drop/template/config.yaml @@ -1,10 +1,10 @@ projectTitle: "DROP: Detection of RNA Outliers Pipeline" -root: # root directory of all intermediate output +root: # root directory of all output objects and tables htmlOutputPath: # path for HTML rendered reports indexWithFolderName: true # whether the root base name should be part of the index name hpoFile: null # if null, downloads it from webserver -sampleAnnotation: # path to sample annotation (see documenation on how to create it) +sampleAnnotation: # path to sample annotation (see documentation on how to create it) root: Output sampleAnnotation: Data/sample_annotation.tsv @@ -14,18 +14,18 @@ genomeAssembly: hg19 genome: Data/hg19.fa hpoFile: null -random_seed: true # just for demo data, remove for proper analysis +random_seed: true # just for demo data, remove for analysis exportCounts: # specify which gene annotations to include and which # groups to exclude when exporting counts geneAnnotations: - v29 - excludeGroups + excludeGroups: - group1 -genome: # path to genome sequence in fasta format. - # You can define multiple reference genomes in yaml format, ncbi: path_to_ncbi, ucsc: path_to_ucsc - # the keywords that define the path should be in the GENOME column of the SA table +genome: # path to reference genome sequence in fasta format. + # You can define multiple reference genomes in yaml format, ncbi: path/to/ncbi, ucsc: path/to/ucsc + # the keywords that define the path should be in the GENOME column of the sample annotation table aberrantExpression: run: true diff --git a/drop/utils.py b/drop/utils.py index 41836afb..bece23e8 100644 --- a/drop/utils.py +++ b/drop/utils.py @@ -86,6 +86,8 @@ def subsetBy(df, column, values): if not isinstance(values, str) : inner_regex = "(" + "|".join(values) + ")" + if df[column].isnull().all(): + return df[[False for i in range(df.shape[0])]] return df[df[column].str.contains("(?:^|,)" + inner_regex + "(?:,|$)", na = False)] def deep_merge_dict(dict1: dict, dict2: dict, inplace: bool = False): diff --git a/setup.cfg b/setup.cfg index 9601b613..58ea8f1f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 1.2.1 +current_version = 1.2.2 commit = True diff --git a/setup.py b/setup.py index 35ffe8e1..d77a4a8d 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ setuptools.setup( name="drop", - version="1.2.1", + version="1.2.2", author="Vicente A. Yépez, Michaela Müller, Nicholas H. Smith, Daniela Klaproth-Andrade, Luise Schuller, Ines Scheller, Christian Mertes , Julien Gagneur ", author_email="yepez@in.tum.de",