Skip to content

Commit

Permalink
Merge pull request nf-core#1123 from nf-core/improve_tximport
Browse files Browse the repository at this point in the history
Overhaul tximport.r, output length tables
  • Loading branch information
pinin4fjords authored Nov 21, 2023
2 parents 76464e4 + 56edb1c commit 79443ec
Show file tree
Hide file tree
Showing 4 changed files with 140 additions and 75 deletions.
187 changes: 118 additions & 69 deletions bin/tximport.r
Original file line number Diff line number Diff line change
@@ -1,94 +1,143 @@
#!/usr/bin/env Rscript

# Written by Lorena Pantano and released under the MIT license.
# Script for importing and processing transcript-level quantifications.
# Written by Lorena Pantano, later modified by Jonathan Manning, and released
# under the MIT license.

# Loading required libraries
library(SummarizedExperiment)
library(tximport)

args = commandArgs(trailingOnly=TRUE)
# Parsing command line arguments
args <- commandArgs(trailingOnly=TRUE)
if (length(args) < 4) {
stop("Usage: tximport.r <coldata> <path> <sample_name> <quant_type> <tx2gene_path>", call.=FALSE)
stop("Usage: tximport.r <coldata_path> <path> <prefix> <quant_type> <tx2gene_path>",
call.=FALSE)
}

coldata = args[1]
path = args[2]
sample_name = args[3]
quant_type = args[4]
tx2gene_path = args[5]

prefix = sample_name

info = file.info(tx2gene_path)
if (info$size == 0) {
tx2gene = NULL
} else {
rowdata = read.csv(tx2gene_path, sep="\t", header = FALSE)
colnames(rowdata) = c("tx", "gene_id", "gene_name")
tx2gene = rowdata[,1:2]
# Assigning command line arguments to variables
coldata_path <- args[1]
path <- args[2]
prefix <- args[3]
quant_type <- args[4]
tx2gene_path <- args[5]

## Functions

# Build a table from a SummarizedExperiment object
build_table <- function(se.obj, slot) {
cbind(rowData(se.obj)[,1:2], assays(se.obj)[[slot]])
}

pattern <- ifelse(quant_type == "kallisto", "abundance.tsv", "quant.sf")
fns = list.files(path, pattern = pattern, recursive = T, full.names = T)
names = basename(dirname(fns))
names(fns) = names

if (file.exists(coldata)) {
coldata = read.csv(coldata, sep="\t")
coldata = coldata[match(names, coldata[,1]),]
coldata = cbind(files = fns, coldata)
} else {
message("ColData not available: ", coldata)
coldata = data.frame(files = fns, names = names)
# Write a table to a file with given parameters
write_se_table <- function(params) {
file_name <- paste0(prefix, ".", params$suffix)
write.table(build_table(params$obj, params$slot), file_name,
sep="\t", quote=FALSE, row.names = FALSE)
}

dropInfReps = quant_type == "kallisto"
# Read transcript metadata from a given path
read_transcript_info <- function(tinfo_path){
info <- file.info(tinfo_path)
if (info$size == 0) {
stop("tx2gene file is empty")
}

transcript_info <- read.csv(tinfo_path, sep="\t", header = FALSE,
col.names = c("tx", "gene_id", "gene_name"))

txi = tximport(fns, type = quant_type, txOut = TRUE, dropInfReps = dropInfReps)
rownames(coldata) = coldata[["names"]]
extra = setdiff(rownames(txi[[1]]), as.character(rowdata[["tx"]]))
if (length(extra) > 0) {
rowdata = rbind(rowdata, data.frame(tx=extra, gene_id=extra, gene_name=extra))
extra <- setdiff(rownames(txi[[1]]), as.character(transcript_info[["tx"]]))
transcript_info <- rbind(transcript_info, data.frame(tx=extra, gene_id=extra, gene_name=extra))
transcript_info <- transcript_info[match(rownames(txi[[1]]), transcript_info[["tx"]]), ]
rownames(transcript_info) <- transcript_info[["tx"]]

list(transcript = transcript_info,
gene = unique(transcript_info[,2:3]),
tx2gene = transcript_info[,1:2])
}
rowdata = rowdata[match(rownames(txi[[1]]), as.character(rowdata[["tx"]])),]
rownames(rowdata) = rowdata[["tx"]]
se = SummarizedExperiment(assays = list(counts = txi[["counts"]], abundance = txi[["abundance"]], length = txi[["length"]]),
colData = DataFrame(coldata),
rowData = rowdata)
if (!is.null(tx2gene)) {
gi = summarizeToGene(txi, tx2gene = tx2gene)
gi.ls = summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "lengthScaledTPM")
gi.s = summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "scaledTPM")
growdata = unique(rowdata[,2:3])
growdata = growdata[match(rownames(gi[[1]]), growdata[["gene_id"]]),]
rownames(growdata) = growdata[["tx"]]
gse = SummarizedExperiment(assays = list(counts = gi[["counts"]], abundance = gi[["abundance"]], length = gi[["length"]]),
colData = DataFrame(coldata),
rowData = growdata)
gse.ls = SummarizedExperiment(assays = list(counts = gi.ls[["counts"]], abundance = gi.ls[["abundance"]], length = gi.ls[["length"]]),
colData = DataFrame(coldata),
rowData = growdata)
gse.s = SummarizedExperiment(assays = list(counts = gi.s[["counts"]], abundance = gi.s[["abundance"]], length = gi.s[["length"]]),
colData = DataFrame(coldata),
rowData = growdata)

# Read and process sample/column data from a given path
read_coldata <- function(coldata_path){
if (file.exists(coldata_path)) {
coldata <- read.csv(coldata_path, sep="\t")
coldata <- coldata[match(names, coldata[,1]),]
coldata <- cbind(files = fns, coldata)
} else {
message("ColData not available: ", coldata_path)
coldata <- data.frame(files = fns, names = names)
}
rownames(coldata) <- coldata[["names"]]
}

build_table = function(se.obj, slot) {
cbind(rowData(se.obj)[,1:2], assays(se.obj)[[slot]])
# Create a SummarizedExperiment object with given data
create_summarized_experiment <- function(counts, abundance, length, col_data, row_data) {
SummarizedExperiment(assays = list(counts = counts, abundance = abundance, length = length),
colData = col_data,
rowData = row_data)
}

if(exists("gse")){
write.table(build_table(gse, "abundance"), paste(c(prefix, "gene_tpm.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
write.table(build_table(gse, "counts"), paste(c(prefix, "gene_counts.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
write.table(build_table(gse.ls, "abundance"), paste(c(prefix, "gene_tpm_length_scaled.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
write.table(build_table(gse.ls, "counts"), paste(c(prefix, "gene_counts_length_scaled.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
write.table(build_table(gse.s, "abundance"), paste(c(prefix, "gene_tpm_scaled.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
write.table(build_table(gse.s, "counts"), paste(c(prefix, "gene_counts_scaled.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
# Main script starts here

# Define pattern for file names based on quantification type
pattern <- ifelse(quant_type == "kallisto", "abundance.tsv", "quant.sf")
fns <- list.files(path, pattern = pattern, recursive = T, full.names = T)
names <- basename(dirname(fns))
names(fns) <- names
dropInfReps <- quant_type == "kallisto"

# Import transcript-level quantifications
txi <- tximport(fns, type = quant_type, txOut = TRUE, dropInfReps = dropInfReps)

# Read transcript and sample data
transcript_info <- read_transcript_info(tx2gene_path)
coldata <- read_coldata(coldata_path)

# Create initial SummarizedExperiment object
se <- create_summarized_experiment(txi[["counts"]], txi[["abundance"]], txi[["length"]],
DataFrame(coldata), transcript_info$transcript)

# Setting parameters for writing tables
params <- list(
list(obj = se, slot = "abundance", suffix = "transcript_tpm.tsv"),
list(obj = se, slot = "counts", suffix = "transcript_counts.tsv"),
list(obj = se, slot = "length", suffix = "transcript_lengths.tsv")
)

# Process gene-level data if tx2gene mapping is available
if ("tx2gene" %in% names(transcript_info) && !is.null(transcript_info$tx2gene)) {
tx2gene <- transcript_info$tx2gene
gi <- summarizeToGene(txi, tx2gene = tx2gene)
gi.ls <- summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "lengthScaledTPM")
gi.s <- summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "scaledTPM")

gene_info <- transcript_info$gene[match(rownames(gi[[1]]), transcript_info$gene[["gene_id"]]),]
rownames(gene_info) <- gene_info[["tx"]]

col_data_frame <- DataFrame(coldata)

# Create gene-level SummarizedExperiment objects
gse <- create_summarized_experiment(gi[["counts"]], gi[["abundance"]], gi[["length"]],
col_data_frame, gene_info)
gse.ls <- create_summarized_experiment(gi.ls[["counts"]], gi.ls[["abundance"]], gi.ls[["length"]],
col_data_frame, gene_info)
gse.s <- create_summarized_experiment(gi.s[["counts"]], gi.s[["abundance"]], gi.s[["length"]],
col_data_frame, gene_info)

params <- c(params, list(
list(obj = gse, slot = "length", suffix = "gene_lengths.tsv"),
list(obj = gse, slot = "abundance", suffix = "gene_tpm.tsv"),
list(obj = gse, slot = "counts", suffix = "gene_counts.tsv"),
list(obj = gse.ls, slot = "abundance", suffix = "gene_tpm_length_scaled.tsv"),
list(obj = gse.ls, slot = "counts", suffix = "gene_counts_length_scaled.tsv"),
list(obj = gse.s, slot = "abundance", suffix = "gene_tpm_scaled.tsv"),
list(obj = gse.s, slot = "counts", suffix = "gene_counts_scaled.tsv")
))
}

write.table(build_table(se, "abundance"), paste(c(prefix, "transcript_tpm.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
write.table(build_table(se, "counts"), paste(c(prefix, "transcript_counts.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
# Writing tables for each set of parameters
done <- lapply(params, write_se_table)

# Print sessioninfo to standard out
# Output session information and citations
citation("tximeta")
sessionInfo()

2 changes: 2 additions & 0 deletions modules/local/tximport/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,10 @@ process TXIMPORT {
path "*gene_counts.tsv" , emit: counts_gene
path "*gene_counts_length_scaled.tsv", emit: counts_gene_length_scaled
path "*gene_counts_scaled.tsv" , emit: counts_gene_scaled
path "*gene_lengths.tsv" , emit: lengths_gene
path "*transcript_tpm.tsv" , emit: tpm_transcript
path "*transcript_counts.tsv" , emit: counts_transcript
path "*transcript_lengths.tsv" , emit: lengths_transcript
path "versions.yml" , emit: versions

when:
Expand Down
14 changes: 8 additions & 6 deletions subworkflows/local/quantify_pseudo/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -79,12 +79,14 @@ workflow QUANTIFY_PSEUDO_ALIGNMENT {
results = ch_pseudo_results // channel: [ val(meta), results_dir ]
multiqc = ch_pseudo_multiqc // channel: [ val(meta), files_for_multiqc ]

tpm_gene = TXIMPORT.out.tpm_gene // channel: [ val(meta), counts ]
counts_gene = TXIMPORT.out.counts_gene // channel: [ val(meta), counts ]
counts_gene_length_scaled = TXIMPORT.out.counts_gene_length_scaled // channel: [ val(meta), counts ]
counts_gene_scaled = TXIMPORT.out.counts_gene_scaled // channel: [ val(meta), counts ]
tpm_transcript = TXIMPORT.out.tpm_transcript // channel: [ val(meta), counts ]
counts_transcript = TXIMPORT.out.counts_transcript // channel: [ val(meta), counts ]
tpm_gene = TXIMPORT.out.tpm_gene // path *gene_tpm.tsv
counts_gene = TXIMPORT.out.counts_gene // path *gene_counts.tsv
lengths_gene = TXIMPORT.out.lengths_gene // path *gene_lengths.tsv
counts_gene_length_scaled = TXIMPORT.out.counts_gene_length_scaled // path *gene_counts_length_scaled.tsv
counts_gene_scaled = TXIMPORT.out.counts_gene_scaled // path *gene_counts_scaled.tsv
tpm_transcript = TXIMPORT.out.tpm_transcript // path *gene_tpm.tsv
counts_transcript = TXIMPORT.out.counts_transcript // path *transcript_counts.tsv
lengths_transcript = TXIMPORT.out.lengths_transcript // path *transcript_lengths.tsv

merged_gene_rds = SE_GENE.out.rds // path: *.rds
merged_gene_rds_length_scaled = SE_GENE_LENGTH_SCALED.out.rds // path: *.rds
Expand Down
12 changes: 12 additions & 0 deletions tower.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ reports:
display: "All samples STAR Salmon DESeq2 QC PDF plots"
"**/star_salmon/salmon.merged.gene_counts.tsv":
display: "All samples STAR Salmon merged gene raw counts"
"**/star_salmon/salmon.merged.gene_lengths.tsv":
display: "All samples STAR Salmon mean transcript length per gene"
"**/star_salmon/salmon.merged.transcript_lengths.tsv":
display: "All samples STAR Salmon transcript lengths"
"**/star_salmon/salmon.merged.gene_tpm.tsv":
display: "All samples STAR Salmon merged gene TPM counts"
"**/star_salmon/salmon.merged.transcript_counts.tsv":
Expand All @@ -15,6 +19,10 @@ reports:
display: "All samples Salmon DESeq2 QC PDF plots"
"**/salmon/salmon.merged.gene_counts.tsv":
display: "All samples Salmon merged gene raw counts"
"**/salmon/salmon.merged.gene_lengths.tsv":
display: "All samples Salmon mean transcript length per gene"
"**/salmon/salmon.merged.transcript_lengths.tsv":
display: "All samples Salmon transcript lengths"
"**/salmon/salmon.merged.gene_tpm.tsv":
display: "All samples Salmon merged gene TPM counts"
"**/salmon/salmon.merged.transcript_counts.tsv":
Expand All @@ -25,6 +33,10 @@ reports:
display: "All samples Kallisto DESeq2 QC PDF plots"
"**/kallisto/kallisto.merged.gene_counts.tsv":
display: "All samples Kallisto merged gene raw counts"
"**/kallisto/kallisto.merged.gene_lengths.tsv":
display: "All samples Kallisto mean transcript length per gene"
"**/kallisto/kallisto.merged.transcript_lengths.tsv":
display: "All samples Kallisto transcript lengths"
"**/kallisto/kallisto.merged.gene_tpm.tsv":
display: "All samples Kallisto merged gene TPM counts"
"**/kallisto/kallisto.merged.transcript_counts.tsv":
Expand Down

0 comments on commit 79443ec

Please sign in to comment.