From 5b99b9bbf01072c88e5ea370d5ce86cd5bd9a124 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Mon, 20 Nov 2023 12:13:32 +0000 Subject: [PATCH 01/10] [skip ci] overhaul tximport.r --- bin/tximport.r | 177 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 117 insertions(+), 60 deletions(-) diff --git a/bin/tximport.r b/bin/tximport.r index ee3d8b212..75ff41ef4 100755 --- a/bin/tximport.r +++ b/bin/tximport.r @@ -7,86 +7,143 @@ library(tximport) args = commandArgs(trailingOnly=TRUE) if (length(args) < 4) { - stop("Usage: tximport.r ", call.=FALSE) + stop("Usage: tximport.r ", call.=FALSE) } -coldata = args[1] +coldata_path = args[1] path = args[2] -sample_name = args[3] +prefix = args[3] quant_type = args[4] tx2gene_path = args[5] -prefix = sample_name +## Functions -info = file.info(tx2gene_path) -if (info$size == 0) { - tx2gene = NULL -} else { - rowdata = read.csv(tx2gene_path, sep="\t", header = FALSE) - colnames(rowdata) = c("tx", "gene_id", "gene_name") - tx2gene = rowdata[,1:2] +# Function to build a table from a SummarizedExperiment object +build_table <- function(se.obj, slot) { + cbind(rowData(se.obj)[,1:2], assays(se.obj)[[slot]]) +} + +# Function to write a table to a file +write_se_table <- function(params) { + file_name <- paste0(prefix, ".", params$suffix) + write.table(build_table(params$obj, params$slot), file_name, sep="\t", quote=FALSE, row.names = FALSE) } +# Read and process transcript meta +read_transcript_info <- function(tinfo_path){ + + info = file.info(tinfo_path) + if (info$size == 0) { + stop("tx2gene file is empty") + } + + # Read transcript information and set column names + transcript_info = read.csv( + tinfo_path, + sep="\t", + header = FALSE, + col.names = c("tx", "gene_id", "gene_name") + ) + + # Identify and add extra transcripts + extra = setdiff(rownames(txi[[1]]), as.character(transcript_info[["tx"]])) + transcript_info = rbind(transcript_info, data.frame(tx=extra, gene_id=extra, gene_name=extra)) + + # Reorder rows to match the order in txi and set row names + transcript_info = transcript_info[match(rownames(txi[[1]]), transcript_info[["tx"]]), ] + rownames(transcript_info) = transcript_info[["tx"]] + + # Return various formulations of the info for downstream use + list( + transcript = transcript_info, + gene = unique(transcript_info[,2:3]), + tx2gene = transcript_info[,1:2] + ) +} + +# Read sample/column data +read_coldata <- function(coldata_path){ + + if (file.exists(coldata_path)) { + coldata = read.csv(coldata_path, sep="\t") + coldata = coldata[match(names, coldata[,1]),] + coldata = cbind(files = fns, coldata) + } else { + message("ColData not available: ", coldata_path) + coldata = data.frame(files = fns, names = names) + } + + rownames(coldata) = coldata[["names"]] +} + +# Function to create a SummarizedExperiment object +create_summarized_experiment <- function(counts, abundance, length, col_data, row_data) { + return(SummarizedExperiment(assays = list(counts = counts, abundance = abundance, length = length), + colData = col_data, + rowData = row_data)) +} + +# Read the abundance values pattern <- ifelse(quant_type == "kallisto", "abundance.tsv", "quant.sf") fns = list.files(path, pattern = pattern, recursive = T, full.names = T) names = basename(dirname(fns)) names(fns) = names +dropInfReps = quant_type == "kallisto" +txi = tximport(fns, type = quant_type, txOut = TRUE, dropInfReps = dropInfReps) -if (file.exists(coldata)) { - coldata = read.csv(coldata, sep="\t") - coldata = coldata[match(names, coldata[,1]),] - coldata = cbind(files = fns, coldata) -} else { - message("ColData not available: ", coldata) - coldata = data.frame(files = fns, names = names) -} +# Read transcript metadata +transcript_info = read_transcript_info(tx2gene_path) -dropInfReps = quant_type == "kallisto" +# Read sample/ column data +coldata = read_coldata(coldata_path) -txi = tximport(fns, type = quant_type, txOut = TRUE, dropInfReps = dropInfReps) -rownames(coldata) = coldata[["names"]] -extra = setdiff(rownames(txi[[1]]), as.character(rowdata[["tx"]])) -if (length(extra) > 0) { - rowdata = rbind(rowdata, data.frame(tx=extra, gene_id=extra, gene_name=extra)) -} -rowdata = rowdata[match(rownames(txi[[1]]), as.character(rowdata[["tx"]])),] -rownames(rowdata) = rowdata[["tx"]] -se = SummarizedExperiment(assays = list(counts = txi[["counts"]], abundance = txi[["abundance"]], length = txi[["length"]]), - colData = DataFrame(coldata), - rowData = rowdata) -if (!is.null(tx2gene)) { - gi = summarizeToGene(txi, tx2gene = tx2gene) - gi.ls = summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "lengthScaledTPM") - gi.s = summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "scaledTPM") - growdata = unique(rowdata[,2:3]) - growdata = growdata[match(rownames(gi[[1]]), growdata[["gene_id"]]),] - rownames(growdata) = growdata[["tx"]] - gse = SummarizedExperiment(assays = list(counts = gi[["counts"]], abundance = gi[["abundance"]], length = gi[["length"]]), - colData = DataFrame(coldata), - rowData = growdata) - gse.ls = SummarizedExperiment(assays = list(counts = gi.ls[["counts"]], abundance = gi.ls[["abundance"]], length = gi.ls[["length"]]), - colData = DataFrame(coldata), - rowData = growdata) - gse.s = SummarizedExperiment(assays = list(counts = gi.s[["counts"]], abundance = gi.s[["abundance"]], length = gi.s[["length"]]), - colData = DataFrame(coldata), - rowData = growdata) -} +# Process input data to summarized experiment objects -build_table = function(se.obj, slot) { - cbind(rowData(se.obj)[,1:2], assays(se.obj)[[slot]]) -} +# Create initial SummarizedExperiment object +se <- create_summarized_experiment(txi[["counts"]], txi[["abundance"]], txi[["length"]], + DataFrame(coldata), transcript_info$transcript) + +# Parameters for writing tables +params <- list( + list(obj = se, slot = "abundance", suffix = "transcript_tpm.tsv"), + list(obj = se, slot = "counts", suffix = "transcript_counts.tsv"), + list(obj = se, slot = "length", suffix = "transcript_lengths.tsv") +) + +# Check if tx2gene is present in transcript_info +if ("tx2gene" %in% names(transcript_info) && !is.null(transcript_info$tx2gene)) { + tx2gene <- transcript_info$tx2gene + gi <- summarizeToGene(txi, tx2gene = tx2gene) + gi.ls <- summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "lengthScaledTPM") + gi.s <- summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "scaledTPM") + + # Get gene information matching the rows in gi + gene_info <- transcript_info$gene[match(rownames(gi[[1]]), transcript_info$gene[["gene_id"]]),] + rownames(gene_info) <- gene_info[["tx"]] + + # Create dataframe only once for reuse + col_data_frame <- DataFrame(coldata) -if(exists("gse")){ - write.table(build_table(gse, "abundance"), paste(c(prefix, "gene_tpm.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) - write.table(build_table(gse, "counts"), paste(c(prefix, "gene_counts.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) - write.table(build_table(gse.ls, "abundance"), paste(c(prefix, "gene_tpm_length_scaled.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) - write.table(build_table(gse.ls, "counts"), paste(c(prefix, "gene_counts_length_scaled.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) - write.table(build_table(gse.s, "abundance"), paste(c(prefix, "gene_tpm_scaled.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) - write.table(build_table(gse.s, "counts"), paste(c(prefix, "gene_counts_scaled.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) + # Create gene-level SummarizedExperiment objects + gse <- create_summarized_experiment(gi[["counts"]], gi[["abundance"]], gi[["length"]], + col_data_frame, gene_info) + gse.ls <- create_summarized_experiment(gi.ls[["counts"]], gi.ls[["abundance"]], gi.ls[["length"]], + col_data_frame, gene_info) + gse.s <- create_summarized_experiment(gi.s[["counts"]], gi.s[["abundance"]], gi.s[["length"]], + col_data_frame, gene_info) + params <- c(params, list( + list(obj = gse, slot = "length", suffix = "gene_lengths.tsv"), + list(obj = gse, slot = "abundance", suffix = "gene_tpm.tsv"), + list(obj = gse, slot = "counts", suffix = "gene_counts.tsv"), + list(obj = gse.ls, slot = "abundance", suffix = "gene_tpm_length_scaled.tsv"), + list(obj = gse.ls, slot = "counts", suffix = "gene_counts_length_scaled.tsv"), + list(obj = gse.s, slot = "abundance", suffix = "gene_tpm_scaled.tsv"), + list(obj = gse.s, slot = "counts", suffix = "gene_counts_scaled.tsv") + )) } -write.table(build_table(se, "abundance"), paste(c(prefix, "transcript_tpm.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) -write.table(build_table(se, "counts"), paste(c(prefix, "transcript_counts.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) +# Apply the write_se_table function to each set of parameters +lapply(params, write_se_table) # Print sessioninfo to standard out citation("tximeta") From 5f1826d4b837e7abfab8467baf9cc55ff06d4f4f Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Mon, 20 Nov 2023 13:18:47 +0000 Subject: [PATCH 02/10] Fix formatting etc in tximport script --- bin/tximport.r | 122 +++++++++++++++++++++++-------------------------- 1 file changed, 56 insertions(+), 66 deletions(-) mode change 100755 => 100644 bin/tximport.r diff --git a/bin/tximport.r b/bin/tximport.r old mode 100755 new mode 100644 index 75ff41ef4..e81e5a7e6 --- a/bin/tximport.r +++ b/bin/tximport.r @@ -1,127 +1,117 @@ #!/usr/bin/env Rscript +# Script for importing and processing transcript-level quantifications. # Written by Lorena Pantano and released under the MIT license. +# Loading required libraries library(SummarizedExperiment) library(tximport) -args = commandArgs(trailingOnly=TRUE) +# Parsing command line arguments +args <- commandArgs(trailingOnly=TRUE) if (length(args) < 4) { - stop("Usage: tximport.r ", call.=FALSE) + stop("Usage: tximport.r ", + call.=FALSE) } -coldata_path = args[1] -path = args[2] -prefix = args[3] -quant_type = args[4] -tx2gene_path = args[5] +# Assigning command line arguments to variables +coldata_path <- args[1] +path <- args[2] +prefix <- args[3] +quant_type <- args[4] +tx2gene_path <- args[5] ## Functions -# Function to build a table from a SummarizedExperiment object +# Build a table from a SummarizedExperiment object build_table <- function(se.obj, slot) { cbind(rowData(se.obj)[,1:2], assays(se.obj)[[slot]]) } -# Function to write a table to a file +# Write a table to a file with given parameters write_se_table <- function(params) { file_name <- paste0(prefix, ".", params$suffix) - write.table(build_table(params$obj, params$slot), file_name, sep="\t", quote=FALSE, row.names = FALSE) + write.table(build_table(params$obj, params$slot), file_name, + sep="\t", quote=FALSE, row.names = FALSE) } -# Read and process transcript meta +# Read transcript metadata from a given path read_transcript_info <- function(tinfo_path){ - - info = file.info(tinfo_path) + info <- file.info(tinfo_path) if (info$size == 0) { stop("tx2gene file is empty") } - # Read transcript information and set column names - transcript_info = read.csv( - tinfo_path, - sep="\t", - header = FALSE, - col.names = c("tx", "gene_id", "gene_name") - ) - - # Identify and add extra transcripts - extra = setdiff(rownames(txi[[1]]), as.character(transcript_info[["tx"]])) - transcript_info = rbind(transcript_info, data.frame(tx=extra, gene_id=extra, gene_name=extra)) - - # Reorder rows to match the order in txi and set row names - transcript_info = transcript_info[match(rownames(txi[[1]]), transcript_info[["tx"]]), ] - rownames(transcript_info) = transcript_info[["tx"]] - - # Return various formulations of the info for downstream use - list( - transcript = transcript_info, - gene = unique(transcript_info[,2:3]), - tx2gene = transcript_info[,1:2] - ) + transcript_info <- read.csv(tinfo_path, sep="\t", header = FALSE, + col.names = c("tx", "gene_id", "gene_name")) + + extra <- setdiff(rownames(txi[[1]]), as.character(transcript_info[["tx"]])) + transcript_info <- rbind(transcript_info, data.frame(tx=extra, gene_id=extra, gene_name=extra)) + transcript_info <- transcript_info[match(rownames(txi[[1]]), transcript_info[["tx"]]), ] + rownames(transcript_info) <- transcript_info[["tx"]] + + list(transcript = transcript_info, + gene = unique(transcript_info[,2:3]), + tx2gene = transcript_info[,1:2]) } -# Read sample/column data +# Read and process sample/column data from a given path read_coldata <- function(coldata_path){ - if (file.exists(coldata_path)) { - coldata = read.csv(coldata_path, sep="\t") - coldata = coldata[match(names, coldata[,1]),] - coldata = cbind(files = fns, coldata) + coldata <- read.csv(coldata_path, sep="\t") + coldata <- coldata[match(names, coldata[,1]),] + coldata <- cbind(files = fns, coldata) } else { message("ColData not available: ", coldata_path) - coldata = data.frame(files = fns, names = names) + coldata <- data.frame(files = fns, names = names) } - - rownames(coldata) = coldata[["names"]] + rownames(coldata) <- coldata[["names"]] } -# Function to create a SummarizedExperiment object +# Create a SummarizedExperiment object with given data create_summarized_experiment <- function(counts, abundance, length, col_data, row_data) { - return(SummarizedExperiment(assays = list(counts = counts, abundance = abundance, length = length), - colData = col_data, - rowData = row_data)) + SummarizedExperiment(assays = list(counts = counts, abundance = abundance, length = length), + colData = col_data, + rowData = row_data) } -# Read the abundance values -pattern <- ifelse(quant_type == "kallisto", "abundance.tsv", "quant.sf") -fns = list.files(path, pattern = pattern, recursive = T, full.names = T) -names = basename(dirname(fns)) -names(fns) = names -dropInfReps = quant_type == "kallisto" -txi = tximport(fns, type = quant_type, txOut = TRUE, dropInfReps = dropInfReps) +# Main script starts here -# Read transcript metadata -transcript_info = read_transcript_info(tx2gene_path) +# Define pattern for file names based on quantification type +pattern <- ifelse(quant_type == "kallisto", "abundance.tsv", "quant.sf") +fns <- list.files(path, pattern = pattern, recursive = T, full.names = T) +names <- basename(dirname(fns)) +names(fns) <- names +dropInfReps <- quant_type == "kallisto" -# Read sample/ column data -coldata = read_coldata(coldata_path) +# Import transcript-level quantifications +txi <- tximport(fns, type = quant_type, txOut = TRUE, dropInfReps = dropInfReps) -# Process input data to summarized experiment objects +# Read transcript and sample data +transcript_info <- read_transcript_info(tx2gene_path) +coldata <- read_coldata(coldata_path) # Create initial SummarizedExperiment object se <- create_summarized_experiment(txi[["counts"]], txi[["abundance"]], txi[["length"]], DataFrame(coldata), transcript_info$transcript) -# Parameters for writing tables +# Setting parameters for writing tables params <- list( list(obj = se, slot = "abundance", suffix = "transcript_tpm.tsv"), list(obj = se, slot = "counts", suffix = "transcript_counts.tsv"), list(obj = se, slot = "length", suffix = "transcript_lengths.tsv") ) -# Check if tx2gene is present in transcript_info +# Process gene-level data if tx2gene mapping is available if ("tx2gene" %in% names(transcript_info) && !is.null(transcript_info$tx2gene)) { tx2gene <- transcript_info$tx2gene gi <- summarizeToGene(txi, tx2gene = tx2gene) gi.ls <- summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "lengthScaledTPM") gi.s <- summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "scaledTPM") - # Get gene information matching the rows in gi gene_info <- transcript_info$gene[match(rownames(gi[[1]]), transcript_info$gene[["gene_id"]]),] rownames(gene_info) <- gene_info[["tx"]] - # Create dataframe only once for reuse col_data_frame <- DataFrame(coldata) # Create gene-level SummarizedExperiment objects @@ -142,10 +132,10 @@ if ("tx2gene" %in% names(transcript_info) && !is.null(transcript_info$tx2gene)) )) } -# Apply the write_se_table function to each set of parameters -lapply(params, write_se_table) +# Writing tables for each set of parameters +done <- lapply(params, write_se_table) -# Print sessioninfo to standard out +# Output session information and citations citation("tximeta") sessionInfo() From 489bcb4efdc7bd58839b22b0360d26b4d80b87a8 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Mon, 20 Nov 2023 13:43:44 +0000 Subject: [PATCH 03/10] update workflow files for new tximport script --- modules/local/tximport/main.nf | 2 ++ subworkflows/local/quantify_pseudo/main.nf | 14 ++++++++------ tower.yml | 12 ++++++++++++ 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/modules/local/tximport/main.nf b/modules/local/tximport/main.nf index 56d826a7d..cd97d139f 100644 --- a/modules/local/tximport/main.nf +++ b/modules/local/tximport/main.nf @@ -16,8 +16,10 @@ process TXIMPORT { path "*gene_counts.tsv" , emit: counts_gene path "*gene_counts_length_scaled.tsv", emit: counts_gene_length_scaled path "*gene_counts_scaled.tsv" , emit: counts_gene_scaled + path "*gene_lengths.tsv" , emit: lengths_gene path "*transcript_tpm.tsv" , emit: tpm_transcript path "*transcript_counts.tsv" , emit: counts_transcript + path "*transcript_lengths.tsv" , emit: lengths_transcript path "versions.yml" , emit: versions when: diff --git a/subworkflows/local/quantify_pseudo/main.nf b/subworkflows/local/quantify_pseudo/main.nf index 886b45e7b..bbb4a7185 100644 --- a/subworkflows/local/quantify_pseudo/main.nf +++ b/subworkflows/local/quantify_pseudo/main.nf @@ -79,12 +79,14 @@ workflow QUANTIFY_PSEUDO_ALIGNMENT { results = ch_pseudo_results // channel: [ val(meta), results_dir ] multiqc = ch_pseudo_multiqc // channel: [ val(meta), files_for_multiqc ] - tpm_gene = TXIMPORT.out.tpm_gene // channel: [ val(meta), counts ] - counts_gene = TXIMPORT.out.counts_gene // channel: [ val(meta), counts ] - counts_gene_length_scaled = TXIMPORT.out.counts_gene_length_scaled // channel: [ val(meta), counts ] - counts_gene_scaled = TXIMPORT.out.counts_gene_scaled // channel: [ val(meta), counts ] - tpm_transcript = TXIMPORT.out.tpm_transcript // channel: [ val(meta), counts ] - counts_transcript = TXIMPORT.out.counts_transcript // channel: [ val(meta), counts ] + tpm_gene = TXIMPORT.out.tpm_gene // path *gene_tpm.tsv + counts_gene = TXIMPORT.out.counts_gene // path *gene_counts.tsv + lengths_gene = TXIMPORT.out.lengths_gene // path *gene_lengths.tsv + counts_gene_length_scaled = TXIMPORT.out.counts_gene_length_scaled // path *gene_counts_length_scaled.tsv + counts_gene_scaled = TXIMPORT.out.counts_gene_scaled // path *gene_counts_scaled.tsv + tpm_transcript = TXIMPORT.out.tpm_transcript // path *gene_tpm.tsv + counts_transcript = TXIMPORT.out.counts_transcript // path *transcript_counts.tsv + lengths_transcript = TXIMPORT.out.lengths_transript // path *transcript_lengths.tsv merged_gene_rds = SE_GENE.out.rds // path: *.rds merged_gene_rds_length_scaled = SE_GENE_LENGTH_SCALED.out.rds // path: *.rds diff --git a/tower.yml b/tower.yml index 21f821a53..4079a8405 100644 --- a/tower.yml +++ b/tower.yml @@ -4,6 +4,10 @@ reports: "**/star_salmon/**/deseq2.plots.pdf": display: "All samples STAR Salmon DESeq2 QC PDF plots" "**/star_salmon/salmon.merged.gene_counts.tsv": + display: "All samples STAR Salmon mean transcript length per gene" + "**/star_salmon/salmon.merged.gene_lengths.tsv": + display: "All samples STAR Salmon transcript lengths" + "**/star_salmon/salmon.merged.transcript_lengths.tsv": display: "All samples STAR Salmon merged gene raw counts" "**/star_salmon/salmon.merged.gene_tpm.tsv": display: "All samples STAR Salmon merged gene TPM counts" @@ -14,6 +18,10 @@ reports: "**/salmon/**/deseq2.plots.pdf": display: "All samples Salmon DESeq2 QC PDF plots" "**/salmon/salmon.merged.gene_counts.tsv": + display: "All samples Salmon mean transcript length per gene" + "**/salmon/salmon.merged.gene_lengths.tsv": + display: "All samples Salmon transcript lengths" + "**/salmon/salmon.merged.transcript_lengths.tsv": display: "All samples Salmon merged gene raw counts" "**/salmon/salmon.merged.gene_tpm.tsv": display: "All samples Salmon merged gene TPM counts" @@ -24,6 +32,10 @@ reports: "**/kallisto/**/deseq2.plots.pdf": display: "All samples Kallisto DESeq2 QC PDF plots" "**/kallisto/kallisto.merged.gene_counts.tsv": + display: "All samples Kallisto mean transcript length per gene" + "**/kallisto/kallisto.merged.gene_lengths.tsv": + display: "All samples Kallisto transcript lengths" + "**/kallisto/kallisto.merged.transcript_lengths.tsv": display: "All samples Kallisto merged gene raw counts" "**/kallisto/kallisto.merged.gene_tpm.tsv": display: "All samples Kallisto merged gene TPM counts" From a9a831c67b04257a0fa139cfa1c9b66f55de1251 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Mon, 20 Nov 2023 13:47:54 +0000 Subject: [PATCH 04/10] appease eclint --- bin/tximport.r | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/bin/tximport.r b/bin/tximport.r index e81e5a7e6..bf7377f9f 100644 --- a/bin/tximport.r +++ b/bin/tximport.r @@ -10,8 +10,8 @@ library(tximport) # Parsing command line arguments args <- commandArgs(trailingOnly=TRUE) if (length(args) < 4) { - stop("Usage: tximport.r ", - call.=FALSE) + stop("Usage: tximport.r ", + call.=FALSE) } # Assigning command line arguments to variables @@ -21,7 +21,7 @@ prefix <- args[3] quant_type <- args[4] tx2gene_path <- args[5] -## Functions +## Functions # Build a table from a SummarizedExperiment object build_table <- function(se.obj, slot) { @@ -31,7 +31,7 @@ build_table <- function(se.obj, slot) { # Write a table to a file with given parameters write_se_table <- function(params) { file_name <- paste0(prefix, ".", params$suffix) - write.table(build_table(params$obj, params$slot), file_name, + write.table(build_table(params$obj, params$slot), file_name, sep="\t", quote=FALSE, row.names = FALSE) } @@ -40,9 +40,9 @@ read_transcript_info <- function(tinfo_path){ info <- file.info(tinfo_path) if (info$size == 0) { stop("tx2gene file is empty") - } + } - transcript_info <- read.csv(tinfo_path, sep="\t", header = FALSE, + transcript_info <- read.csv(tinfo_path, sep="\t", header = FALSE, col.names = c("tx", "gene_id", "gene_name")) extra <- setdiff(rownames(txi[[1]]), as.character(transcript_info[["tx"]])) @@ -51,8 +51,8 @@ read_transcript_info <- function(tinfo_path){ rownames(transcript_info) <- transcript_info[["tx"]] list(transcript = transcript_info, - gene = unique(transcript_info[,2:3]), - tx2gene = transcript_info[,1:2]) + gene = unique(transcript_info[,2:3]), + tx2gene = transcript_info[,1:2]) } # Read and process sample/column data from a given path @@ -71,8 +71,8 @@ read_coldata <- function(coldata_path){ # Create a SummarizedExperiment object with given data create_summarized_experiment <- function(counts, abundance, length, col_data, row_data) { SummarizedExperiment(assays = list(counts = counts, abundance = abundance, length = length), - colData = col_data, - rowData = row_data) + colData = col_data, + rowData = row_data) } # Main script starts here @@ -93,7 +93,7 @@ coldata <- read_coldata(coldata_path) # Create initial SummarizedExperiment object se <- create_summarized_experiment(txi[["counts"]], txi[["abundance"]], txi[["length"]], - DataFrame(coldata), transcript_info$transcript) + DataFrame(coldata), transcript_info$transcript) # Setting parameters for writing tables params <- list( @@ -116,11 +116,12 @@ if ("tx2gene" %in% names(transcript_info) && !is.null(transcript_info$tx2gene)) # Create gene-level SummarizedExperiment objects gse <- create_summarized_experiment(gi[["counts"]], gi[["abundance"]], gi[["length"]], - col_data_frame, gene_info) + col_data_frame, gene_info) gse.ls <- create_summarized_experiment(gi.ls[["counts"]], gi.ls[["abundance"]], gi.ls[["length"]], - col_data_frame, gene_info) + col_data_frame, gene_info) gse.s <- create_summarized_experiment(gi.s[["counts"]], gi.s[["abundance"]], gi.s[["length"]], - col_data_frame, gene_info) + col_data_frame, gene_info) + params <- c(params, list( list(obj = gse, slot = "length", suffix = "gene_lengths.tsv"), list(obj = gse, slot = "abundance", suffix = "gene_tpm.tsv"), From b7066620049a6df0be2e2108f678171c85de1006 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Mon, 20 Nov 2023 13:50:15 +0000 Subject: [PATCH 05/10] appease eclint --- bin/tximport.r | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/tximport.r b/bin/tximport.r index bf7377f9f..528d59e2f 100644 --- a/bin/tximport.r +++ b/bin/tximport.r @@ -51,8 +51,8 @@ read_transcript_info <- function(tinfo_path){ rownames(transcript_info) <- transcript_info[["tx"]] list(transcript = transcript_info, - gene = unique(transcript_info[,2:3]), - tx2gene = transcript_info[,1:2]) + gene = unique(transcript_info[,2:3]), + tx2gene = transcript_info[,1:2]) } # Read and process sample/column data from a given path From 6bc74b649398c9d2c6576b07879303d087fa9389 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Mon, 20 Nov 2023 15:07:34 +0000 Subject: [PATCH 06/10] restore script executability --- bin/tximport.r | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 bin/tximport.r diff --git a/bin/tximport.r b/bin/tximport.r old mode 100644 new mode 100755 From 3e30c263ef306235f78a7f5951af9b8e7bde9639 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Mon, 20 Nov 2023 15:20:18 +0000 Subject: [PATCH 07/10] correct typo --- subworkflows/local/quantify_pseudo/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/quantify_pseudo/main.nf b/subworkflows/local/quantify_pseudo/main.nf index bbb4a7185..7887f086f 100644 --- a/subworkflows/local/quantify_pseudo/main.nf +++ b/subworkflows/local/quantify_pseudo/main.nf @@ -86,7 +86,7 @@ workflow QUANTIFY_PSEUDO_ALIGNMENT { counts_gene_scaled = TXIMPORT.out.counts_gene_scaled // path *gene_counts_scaled.tsv tpm_transcript = TXIMPORT.out.tpm_transcript // path *gene_tpm.tsv counts_transcript = TXIMPORT.out.counts_transcript // path *transcript_counts.tsv - lengths_transcript = TXIMPORT.out.lengths_transript // path *transcript_lengths.tsv + lengths_transcript = TXIMPORT.out.lengths_transcript // path *transcript_lengths.tsv merged_gene_rds = SE_GENE.out.rds // path: *.rds merged_gene_rds_length_scaled = SE_GENE_LENGTH_SCALED.out.rds // path: *.rds From 736bffac8e015ccba163870aa4c6d741d8a456e1 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Mon, 20 Nov 2023 16:18:23 +0000 Subject: [PATCH 08/10] Fix tower yml --- tower.yml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tower.yml b/tower.yml index 4079a8405..7ce7e2dff 100644 --- a/tower.yml +++ b/tower.yml @@ -4,11 +4,11 @@ reports: "**/star_salmon/**/deseq2.plots.pdf": display: "All samples STAR Salmon DESeq2 QC PDF plots" "**/star_salmon/salmon.merged.gene_counts.tsv": - display: "All samples STAR Salmon mean transcript length per gene" + display: "All samples STAR Salmon merged gene raw counts" "**/star_salmon/salmon.merged.gene_lengths.tsv": - display: "All samples STAR Salmon transcript lengths" + display: "All samples STAR Salmon mean transcript length per gene" "**/star_salmon/salmon.merged.transcript_lengths.tsv": - display: "All samples STAR Salmon merged gene raw counts" + display: "All samples STAR Salmon transcript lengths" "**/star_salmon/salmon.merged.gene_tpm.tsv": display: "All samples STAR Salmon merged gene TPM counts" "**/star_salmon/salmon.merged.transcript_counts.tsv": @@ -18,11 +18,11 @@ reports: "**/salmon/**/deseq2.plots.pdf": display: "All samples Salmon DESeq2 QC PDF plots" "**/salmon/salmon.merged.gene_counts.tsv": - display: "All samples Salmon mean transcript length per gene" + display: "All samples Salmon merged gene raw counts" "**/salmon/salmon.merged.gene_lengths.tsv": - display: "All samples Salmon transcript lengths" + display: "All samples Salmon mean transcript length per gene" "**/salmon/salmon.merged.transcript_lengths.tsv": - display: "All samples Salmon merged gene raw counts" + display: "All samples Salmon transcript lengths" "**/salmon/salmon.merged.gene_tpm.tsv": display: "All samples Salmon merged gene TPM counts" "**/salmon/salmon.merged.transcript_counts.tsv": @@ -32,11 +32,11 @@ reports: "**/kallisto/**/deseq2.plots.pdf": display: "All samples Kallisto DESeq2 QC PDF plots" "**/kallisto/kallisto.merged.gene_counts.tsv": - display: "All samples Kallisto mean transcript length per gene" + display: "All samples Kallisto merged gene raw counts" "**/kallisto/kallisto.merged.gene_lengths.tsv": - display: "All samples Kallisto transcript lengths" + display: "All samples Kallisto mean transcript length per gene" "**/kallisto/kallisto.merged.transcript_lengths.tsv": - display: "All samples Kallisto merged gene raw counts" + display: "All samples Kallisto transcript lengths" "**/kallisto/kallisto.merged.gene_tpm.tsv": display: "All samples Kallisto merged gene TPM counts" "**/kallisto/kallisto.merged.transcript_counts.tsv": From 1bad8d5c7b48f24a2e9a08b28c1dce48477dbc1b Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 21 Nov 2023 09:14:59 +0000 Subject: [PATCH 09/10] [skip ci] Add Jon to script credits --- bin/tximport.r | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/tximport.r b/bin/tximport.r index 528d59e2f..a540498d4 100755 --- a/bin/tximport.r +++ b/bin/tximport.r @@ -1,7 +1,7 @@ #!/usr/bin/env Rscript # Script for importing and processing transcript-level quantifications. -# Written by Lorena Pantano and released under the MIT license. +# Written by Lorena Pantano, later modified by Jonathan Manning, and released under the MIT license. # Loading required libraries library(SummarizedExperiment) From 56edb1cd70d9a0ebe067797acff75b690148418f Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 21 Nov 2023 09:17:24 +0000 Subject: [PATCH 10/10] actually don't skip ci --- bin/tximport.r | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/tximport.r b/bin/tximport.r index a540498d4..59a8fcd9e 100755 --- a/bin/tximport.r +++ b/bin/tximport.r @@ -1,7 +1,8 @@ #!/usr/bin/env Rscript # Script for importing and processing transcript-level quantifications. -# Written by Lorena Pantano, later modified by Jonathan Manning, and released under the MIT license. +# Written by Lorena Pantano, later modified by Jonathan Manning, and released +# under the MIT license. # Loading required libraries library(SummarizedExperiment)