Skip to content

Commit

Permalink
update repenrich tool (and repenrich-edger) (#676)
Browse files Browse the repository at this point in the history
* Update edgeR_repenrich.R

* style R code

* lint plots and update tests

* update test

* use macros for both tools

* Drop depracted Bio.Alphabet

* pipe and fix samtools command line

* simplify samtools management

* bedtools 2.31.1 is not compatible with repenrich python ! stay at 2.23.0

* Update .shed.yml
  • Loading branch information
drosofff authored Mar 9, 2024
1 parent 668f7e9 commit 11df9f0
Show file tree
Hide file tree
Showing 9 changed files with 348 additions and 374 deletions.
3 changes: 1 addition & 2 deletions tools/repenrich/.shed.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ categories:
- Transcriptomics
- Variant Analysis
homepage_url: https://github.com/nskvir/RepEnrich
remote_repository_url: https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich
remote_repository_url: https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich
toolshed:
- testtoolshed
- toolshed
3 changes: 1 addition & 2 deletions tools/repenrich/RepEnrich_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import sys

from Bio import SeqIO
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

Expand Down Expand Up @@ -242,7 +241,7 @@ def import_text(filename, separator):
print("Unrecognised Chromosome: "+chr)
pass
# Convert metagenome to SeqRecord object (required by SeqIO.write)
record = SeqRecord(Seq(metagenome, IUPAC.unambiguous_dna), id="repname",
record = SeqRecord(Seq(metagenome), id="repname",
name="", description="")
print("saving repgenome " + newname + ".fa" + " (" + str(k) + " of "
+ str(nrepgenomes) + ")")
Expand Down
220 changes: 92 additions & 128 deletions tools/repenrich/edgeR_repenrich.R
Original file line number Diff line number Diff line change
@@ -1,228 +1,192 @@
#!/usr/bin/env Rscript

# A command-line interface to edgeR for use with Galaxy edger-repenrich
# written by Christophe Antoniewski [email protected] 2017.05.30


# setup R error handling to go to stderr
options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
options(show.error.messages = FALSE, error = function() {
cat(geterrmessage(), file = stderr())
q("no", 1, FALSE)
})

# To not crash galaxy with an UTF8 error with not-US LC settings.
loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")

# load libraries
library("getopt")
library("tools")
library("rjson")
suppressPackageStartupMessages({
library("edgeR")
library("limma")
})

options(stringAsFactors = FALSE, useFancyQuotes = FALSE)
args <- commandArgs(trailingOnly = TRUE)

# get options, using the spec as defined by the enclosed list.
# we read the options from the default: commandArgs(TRUE).
spec <- matrix(c(
"quiet", "q", 0, "logical",
"help", "h", 0, "logical",
"outfile", "o", 1, "character",
"countsfile", "n", 1, "character",
"factorName", "N", 1, "character",
"levelNameA", "A", 1, "character",
"levelNameB", "B", 1, "character",
"levelAfiles", "a", 1, "character",
"levelBfiles", "b", 1, "character",
"alignmentA", "i", 1, "character",
"alignmentB", "j", 1, "character",
"plots" , "p", 1, "character"),
byrow=TRUE, ncol=4)
spec <- matrix(
c(
"quiet", "q", 0, "logical",
"outfile", "o", 1, "character",
"countsfile", "n", 1, "character",
"factorName", "N", 1, "character",
"levelNameA", "A", 1, "character",
"levelNameB", "B", 1, "character",
"levelAfiles", "a", 1, "character",
"levelBfiles", "b", 1, "character",
"alignmentA", "i", 1, "character",
"alignmentB", "j", 1, "character",
"plots", "p", 1, "character"
),
byrow = TRUE, ncol = 4
)
opt <- getopt(spec)

# if help was asked for print a friendly message
# and exit with a non-zero error code
if (!is.null(opt$help)) {
cat(getopt(spec, usage=TRUE))
q(status=1)
}

# enforce the following required arguments
if (is.null(opt$outfile)) {
cat("'outfile' is required\n")
q(status=1)
}
if (is.null(opt$levelAfiles) | is.null(opt$levelBfiles)) {
cat("input count files are required for both levels\n")
q(status=1)
}
if (is.null(opt$alignmentA) | is.null(opt$alignmentB)) {
cat("total aligned read files are required for both levels\n")
q(status=1)
}

verbose <- if (is.null(opt$quiet)) {
TRUE
} else {
FALSE
}

suppressPackageStartupMessages({
library("edgeR")
library("limma")
})

# build levels A and B file lists

library("rjson")
filesA <- fromJSON(opt$levelAfiles, method = "C", unexpected.escape = "error")
filesB <- fromJSON(opt$levelBfiles, method = "C", unexpected.escape = "error")
listA <- list()
indice = 0
indice <- 0
listA[["level"]] <- opt$levelNameA
for (file in filesA) {
indice = indice +1
listA[[paste0(opt$levelNameA,"_",indice)]] <- read.delim(file, header=FALSE)
}
indice <- indice + 1
listA[[paste0(opt$levelNameA, "_", indice)]] <- read.delim(file, header = FALSE)
}
listB <- list()
indice = 0
indice <- 0
listB[["level"]] <- opt$levelNameB
for (file in filesB) {
indice = indice +1
listB[[paste0(opt$levelNameB,"_",indice)]] <- read.delim(file, header=FALSE)
}
indice <- indice + 1
listB[[paste0(opt$levelNameB, "_", indice)]] <- read.delim(file, header = FALSE)
}

# build a counts table
counts <- data.frame(row.names=listA[[2]][,1])
counts <- data.frame(row.names = listA[[2]][, 1])
for (element in names(listA[-1])) {
counts<-cbind(counts, listA[[element]][,4])
}
counts <- cbind(counts, listA[[element]][, 4])
}
for (element in names(listB[-1])) {
counts<-cbind(counts, listB[[element]][,4])
}
colnames(counts)=c(names(listA[-1]), names(listB[-1]))
counts <- cbind(counts, listB[[element]][, 4])
}
colnames(counts) <- c(names(listA[-1]), names(listB[-1]))

# build aligned counts vector

filesi <- fromJSON(opt$alignmentA, method = "C", unexpected.escape = "error")
filesj <- fromJSON(opt$alignmentB, method = "C", unexpected.escape = "error")
sizes <- c()
for (file in filesi) {
sizes <- c(sizes, read.delim(file, header=TRUE)[1,1])
}
sizes <- c(sizes, read.delim(file, header = TRUE)[1, 1])
}
for (file in filesj) {
sizes <- c(sizes, read.delim(file, header=TRUE)[1,1])
}
sizes <- c(sizes, read.delim(file, header = TRUE)[1, 1])
}

# build a meta data object

meta <- data.frame(
row.names=colnames(counts),
condition=c(rep(opt$levelNameA,length(filesA)), rep(opt$levelNameB,length(filesB)) ),
libsize=sizes
row.names = colnames(counts),
condition = c(rep(opt$levelNameA, length(filesA)), rep(opt$levelNameB, length(filesB))),
libsize = sizes
)


# Define the library size and conditions for the GLM
libsize <- meta$libsize
condition <- factor(meta$condition)
design <- model.matrix(~0+condition)
colnames(design) <- levels(meta$condition)

design <- model.matrix(~ 0 + condition)
colnames(design) <- levels(condition)

# Build a DGE object for the GLM
y <- DGEList(counts=counts, lib.size=libsize)
y <- DGEList(counts = counts, lib.size = libsize)

# Normalize the data
y <- calcNormFactors(y)
y$samples
# plotMDS(y) latter

# Estimate the variance
y <- estimateGLMCommonDisp(y, design)
y <- estimateGLMTrendedDisp(y, design)
y <- estimateGLMTagwiseDisp(y, design)
# plotBCV(y) latter

# Builds and outputs an object to contain the normalized read abundance in counts per million of reads
cpm <- cpm(y, log=FALSE, lib.size=libsize)
cpm <- cpm(y, log = FALSE, lib.size = libsize)
cpm <- as.data.frame(cpm)
colnames(cpm) <- colnames(counts)
if (!is.null(opt$countsfile)) {
normalizedAbundance <- data.frame(Tag=rownames(cpm))
normalizedAbundance <- data.frame(Tag = rownames(cpm))
normalizedAbundance <- cbind(normalizedAbundance, cpm)
write.table(normalizedAbundance, file=opt$countsfile, sep="\t", col.names=TRUE, row.names=FALSE, quote=FALSE)
write.table(normalizedAbundance, file = opt$countsfile, sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE)
}

# Conduct fitting of the GLM
yfit <- glmFit(y, design)

# Initialize result matrices to contain the results of the GLM
results <- matrix(nrow=dim(counts)[1],ncol=0)
logfc <- matrix(nrow=dim(counts)[1],ncol=0)
results <- matrix(nrow = dim(counts)[1], ncol = 0)
logfc <- matrix(nrow = dim(counts)[1], ncol = 0)

# Make the comparisons for the GLM
my.contrasts <- makeContrasts(
paste0(opt$levelNameA,"_",opt$levelNameB," = ", opt$levelNameA, " - ", opt$levelNameB),
paste0(opt$levelNameA, "_", opt$levelNameB, " = ", opt$levelNameA, " - ", opt$levelNameB),
levels = design
)

# Define the contrasts used in the comparisons
allcontrasts = paste0(opt$levelNameA," vs ",opt$levelNameB)
allcontrasts <- paste0(opt$levelNameA, " vs ", opt$levelNameB)

# Conduct a for loop that will do the fitting of the GLM for each comparison
# Put the results into the results objects
lrt <- glmLRT(yfit, contrast=my.contrasts[,1])
plotSmear(lrt, de.tags=rownames(y))
title(allcontrasts)
res <- topTags(lrt,n=dim(c)[1],sort.by="none")$table
results <- cbind(results,res[,c(1,5)])
logfc <- cbind(logfc,res[c(1)])
lrt <- glmLRT(yfit, contrast = my.contrasts[, 1])
res <- topTags(lrt, n = dim(c)[1], sort.by = "none")$table
results <- cbind(results, res[, c(1, 5)])
logfc <- cbind(logfc, res[c(1)])

# Add the repeat types back into the results.
# We should still have the same order as the input data
results$class <- listA[[2]][,2]
results$type <- listA[[2]][,3]

results$class <- listA[[2]][, 2]
results$type <- listA[[2]][, 3]
# Sort the results table by the FDR
results <- results[with(results, order(FDR)), ]

# Save the results
write.table(results, opt$outfile, quote=FALSE, sep="\t", col.names=FALSE)

# Plot Fold Changes for repeat classes and types

# open the device and plots
if (!is.null(opt$plots)) {
if (verbose) cat("creating plots\n")
pdf(opt$plots)
plotMDS(y, main="Multidimensional Scaling Plot Of Distances Between Samples")
plotBCV(y, xlab="Gene abundance (Average log CPM)", main="Biological Coefficient of Variation Plot")
plotMDS(y, main = "Multidimensional Scaling Plot Of Distances Between Samples")
plotBCV(y, xlab = "Gene abundance (Average log CPM)", main = "Biological Coefficient of Variation Plot")
logFC <- results[, "logFC"]
# Plot the repeat classes
classes <- with(results, reorder(class, -logFC, median))
classes
par(mar=c(6,10,4,1))
boxplot(logFC ~ classes, data=results, outline=FALSE, horizontal=TRUE,
las=2, xlab="log2(Fold Change)", main=paste0(allcontrasts, ", by Class"))
abline(v=0)
par(mar = c(6, 10, 4, 1))
boxplot(logFC ~ classes,
data = results, outline = FALSE, horizontal = TRUE,
las = 2, xlab = "log2(Fold Change)", ylab = "", cex.axis = 0.7, main = paste0(allcontrasts, ", by Class")
)
abline(v = 0)
# Plot the repeat types
types <- with(results, reorder(type, -logFC, median))
boxplot(logFC ~ types, data=results, outline=FALSE, horizontal=TRUE,
las=2, xlab="log2(Fold Change)", main=paste0(allcontrasts, ", by Type"), yaxt="n")
axis(2, cex.axis=(1*28)/(length(levels(types))),
at=seq(from=1, to=length(levels(types))),
labels=levels(types), las=2)
abline(v=0)
boxplot(logFC ~ types,
data = results, outline = FALSE, horizontal = TRUE,
las = 2, xlab = "log2(Fold Change)", ylab = "", cex.axis = 0.7, main = paste0(allcontrasts, ", by Type")
)
abline(v = 0)
# volcano plot
TEdata = cbind(rownames(results), as.data.frame(results), score=-log(results$FDR, 10))
colnames(TEdata) = c("Tag","log2FC", "FDR", "Class", "Type", "score")
color = ifelse(TEdata$FDR<0.05, "red", "black")
s <- subset(TEdata, FDR<0.01)
with(TEdata, plot(log2FC, score, pch=20, col=color, main="Volcano plot (all tag types)", ylab="-log10(FDR)"))
text(s[,2], s[,6], labels = s[,5], pos = seq(from=1, to=3), cex=0.5)
TEdata <- cbind(rownames(results), as.data.frame(results), score = -log(results$FDR, 10))
colnames(TEdata) <- c("Tag", "log2FC", "FDR", "Class", "Type", "score")
color <- ifelse(TEdata$FDR < 0.05, "red", "black")
s <- subset(TEdata, FDR < 0.01)
with(TEdata, plot(log2FC, score, pch = 20, col = color, main = "Volcano plot (all tag types)", ylab = "-log10(FDR)"))
text(s[, 2], s[, 6], labels = s[, 5], pos = seq(from = 1, to = 3), cex = 0.5)
}

# close the plot device
if (!is.null(opt$plots)) {
cat("closing plot device\n")
dev.off()
cat("closing plot device\n")
dev.off()
}

cat("Session information:\n\n")
# Save the results
results <- cbind(TE_item = rownames(results), results)
colnames(results) <- c("TE_item", "log2FC", "FDR", "Class", "Type")
results$log2FC <- format(results$log2FC, digits = 5)
results$FDR <- format(results$FDR, digits = 5)
write.table(results, opt$outfile, quote = FALSE, sep = "\t", col.names = TRUE, row.names = FALSE)

cat("Session information:\n\n")
sessionInfo()

14 changes: 6 additions & 8 deletions tools/repenrich/edger-repenrich.xml
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
<tool id="edger-repenrich" name="edgeR-repenrich" version="1.5.2">
<tool id="edger-repenrich" name="edgeR-repenrich" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
<description>Determines differentially expressed features from RepEnrich counts</description>
<requirements>
<requirement type="package" version="3.16.5">bioconductor-edger</requirement>
<requirement type="package" version="3.30.13">bioconductor-limma</requirement>
<requirement type="package" version="1.20.0">r-getopt</requirement>
<requirement type="package" version="0.2.15">r-rjson</requirement>
</requirements>
<macros>
<import>macros.xml</import>
</macros>
<expand macro="requirements"/>
<stdio>
<regex match="Execution halted"
source="both"
Expand Down Expand Up @@ -109,7 +107,7 @@
</data>
</outputs>
<tests>
<test>
<test expect_num_outputs="3">
<param name="factorName" value="Genotype"/>
<param name="factorLevel_A" value="Mutant"/>
<param name="countsFiles_A" value="355_fraction_counts.tab,356_fraction_counts.tab"/>
Expand Down
18 changes: 18 additions & 0 deletions tools/repenrich/macros.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
<macros>
<token name="@TOOL_VERSION@">1.83</token>
<token name="@VERSION_SUFFIX@">0</token>
<token name="@PROFILE@">23.0</token>

<xml name="requirements">
<requirements>
<requirement type="package" version="4.0.2">bioconductor-edger</requirement>
<requirement type="package" version="3.58.1">bioconductor-limma</requirement>
<requirement type="package" version="1.20.4">r-getopt</requirement>
<requirement type="package" version="0.2.21">r-rjson</requirement>
<requirement type="package" version="1.0.0">bowtie</requirement>
<requirement type="package" version="1.19.2">samtools</requirement>
<requirement type="package" version="2.23.0">bedtools</requirement>
<requirement type="package" version="@TOOL_VERSION@">biopython</requirement>
</requirements>
</xml>
</macros>
Loading

0 comments on commit 11df9f0

Please sign in to comment.