96 mapping statistics #97

Open
wants to merge 5 commits into dev
3 changes: 3 additions & 0 deletions .gitignore
@@ -5,3 +5,6 @@ results/
testing/
testing*
*.pyc
.nf-test/
test/
.nf-test.log
2 changes: 2 additions & 0 deletions README.md
@@ -90,3 +90,5 @@ You can cite the `nf-core` publication as follows:
> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.
>
> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).

nextflow run main.nf -profile test_hackathon,docker --outdir=test
104 changes: 104 additions & 0 deletions bin/calculate_TPM_HTSeq.R
@@ -0,0 +1,104 @@
#!/usr/bin/env Rscript

#-------------------------
#
# Description:
# Script to calculate TPM values for HTSeq quantification results
#
# Created by B. Mika-Gospodorz
# Date: 29th May 2020
#
# Input args: [1] = quantification_results_uniquely_mapped.tsv - HTSeq quantification results
# [2] = host gene attribute used for quantification, e.g. 'gene_id'
# [3] = pathogen gff file with gene features (from 3rd col) replaced with 'quant'
# [4] = host gff file with gene features (from 3rd col) replaced with 'quant'
# Output file: quantification_results_uniquely_mapped_NumReads_TPM.tsv
#
#-------------------------
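
# Example invocation (file names are illustrative only, not prescribed by the pipeline):
#   Rscript calculate_TPM_HTSeq.R quantification_results_uniquely_mapped.tsv gene_id \
#       pathogen_quant.gff3 host_quant.gff3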

library(plyr)
library(rtracklayer)


args = commandArgs(trailingOnly=TRUE)
# read quantification table
gene_attribute <- args[2]
table_htseq <- read.table(args[1], sep="\t", stringsAsFactors=F, header = T,row.names = gene_attribute)
pathogen_gff <- import(args[3])
host_gff <- import(args[4])


# function to extract gene length from gff file
extract_transcript_length <- function(x, host_gff, gene_attribute){
    # find annotations that contain host transcripts
    h_tr <- host_gff[mcols(host_gff)[gene_attribute][[1]] == x]
    # filter 'quant' features
    quants <- h_tr[h_tr$type == "quant", ]
    # merge overlapping features, so that the same bases are covered by a reduced collection of features
    t_length <- sum(width(reduce(quants)))
    return(t_length)
}
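
# Note: reduce() above merges overlapping 'quant' features before their widths are summed,
# e.g. ranges 1-100 and 50-150 collapse to 1-150 (illustrative coordinates), so shared bases
# are counted only once per transcript.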

######################### extract gene length #######################################

### pathogen
names_gff_pathogen <- mcols(pathogen_gff)[gene_attribute][[1]] # extract gene names from gff file
common_pathogen = intersect(rownames(table_htseq), names_gff_pathogen) # find positions of pathogen genes in quantification table
pathogen_table_htseq <- table_htseq[common_pathogen,] # extract pathogen quantification results
pathogen_gff_match <- match(common_pathogen,names_gff_pathogen) # find positions of corresponding genes in gff file
pathogen_table_htseq <- cbind(length = pathogen_gff@ranges@width[pathogen_gff_match],pathogen_table_htseq) # extract gene length from gff file and combine it with quantification table


### host
names_gff_host <- mcols(host_gff)[gene_attribute][[1]] # extract gene names from gff file
lack_of_attribute <- which(is.na(names_gff_host)) # find positions without gene_attribute (e.g. genes that don't have a transcript_id attribute)
if (length(lack_of_attribute) > 0) {
    host_gff <- host_gff[-lack_of_attribute] # remove positions without gene_attribute
    names_gff_host <- names_gff_host[-lack_of_attribute] # keep gene names that contain gene_attribute
}
common_host = intersect(rownames(table_htseq),names_gff_host) # find positions of host genes in quantification table
host_table_htseq <- table_htseq[common_host,] #extract quantification results of host genes
transcript_length <- sapply(common_host, extract_transcript_length, host_gff=host_gff,gene_attribute = gene_attribute,simplify = TRUE) #extract gene length
host_table_htseq <- cbind(length = transcript_length,host_table_htseq) # add gene length into quantification table


# combine host and pathogen tables
htseq_table_with_gene_length = rbind(pathogen_table_htseq,host_table_htseq)


######################### calculate TPM values #######################################

# function to calculate TPMs
tpm <- function(counts, lengths) {
    rate <- counts / lengths
    return(rate / sum(rate) * 1e6)
}
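
# Worked example (illustrative numbers): for counts c(10, 20) and lengths c(1000, 4000),
# the length-normalised rates are c(0.01, 0.005); dividing by their sum (0.015) and scaling
# by 1e6 gives TPMs of ~666667 and ~333333, which sum to 1e6 for the sample.
# tpm(c(10, 20), c(1000, 4000))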

# function to add _TPM suffix
rename_add_TPM <- function(x) {
    paste(x, "_TPM", sep = "")
}

# function to add _NumReads suffix
rename_add_NumReads <- function(x) {
    paste(x, "_NumReads", sep = "")
}


# calculate TPMs for each gene in each sample
TPMs <- apply(htseq_table_with_gene_length[2:dim(htseq_table_with_gene_length)[2]], 2, function(x) tpm(x, htseq_table_with_gene_length$length))
colnames(TPMs) <- colnames(htseq_table_with_gene_length[,-1])
# add TPM suffix to column names with TPM values
colnames(TPMs) <- sapply(colnames(TPMs), function(x) rename_add_TPM(x))
# add NumReads suffix to column names with NumReads values
colnames_NR <- sapply(colnames(htseq_table_with_gene_length[,-1]),function(x) rename_add_NumReads(x))
# add 'length' name for column which stores gene length
colnames(htseq_table_with_gene_length) <- (c('length',colnames_NR))

# combine results
quant_results <- cbind(name = rownames(TPMs), htseq_table_with_gene_length,TPMs)
names(quant_results)[1] <- gene_attribute

# save results
write.table(quant_results,file = "quantification_results_uniquely_mapped_NumReads_TPM.tsv",sep = "\t", row.names = F, quote = F)

115 changes: 115 additions & 0 deletions bin/collect_quantification_data.py
@@ -0,0 +1,115 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 4 15:00:57 2019

@author: B. Mika-Gospodorz

Input file: list of quantification tables
Output files: quantification_stats_*.tsv and quantification_results_*.tsv (HTSeq option), or *combined_quant.tsv / host_combined_gene_level.tsv (Salmon option)
Description: Used to merge quantification results from all samples
"""

import argparse

import pandas as pd


# function to merge HTSeq quantification results
def collect_quantification_data_HTseq(input_files, profile, gene_attribute):
    # initiate merged quantification data frame
    quant_merged_table = pd.DataFrame()
    for input_file in input_files:  # iterate over sample results
        # read quantification data
        quant_table = pd.read_csv(input_file, sep="\t", header=0)
        if input_file == input_files[0]:  # initiate quant_merged_table with the first sample
            quant_merged_table = quant_table
        else:  # extend quant_merged_table data frame with results of a new sample
            quant_merged_table = pd.concat([quant_merged_table, quant_table], axis=1)
    quant_merged_table.index.names = [gene_attribute]
    # sort quant_merged_table by column labels
    quant_merged_table = quant_merged_table.sort_index(axis=1)

    # extract the last 5 rows of the HTSeq quantification table that contain statistics and save them
    alignment_stats = quant_merged_table["__no_feature":"__alignment_not_unique"]
    alignment_stats.to_csv("quantification_stats_" + profile + ".tsv", sep="\t")
    # remove statistics from quantification results and save quant_merged_table
    quant_merged_table = quant_merged_table.drop(
        ["__no_feature", "__ambiguous", "__too_low_aQual", "__not_aligned", "__alignment_not_unique"]
    )
    quant_merged_table.to_csv("quantification_results_" + profile + ".tsv", sep="\t")


# function to merge Salmon quantification results
def collect_quantification_data_salmon(input_files, gene_attribute, organism):
    for input_file in input_files:  # iterate over sample results
        if organism == "host_gene_level":  # read tximport results
            quant_table = pd.read_csv(input_file, sep="\t", header=0, index_col=0)
        else:
            if organism == "both":  # read quantification table containing both host and pathogen transcripts
                quant_table = pd.read_csv(
                    input_file + "/quant.sf",
                    sep="\t",
                    header=0,
                    index_col=0,
                    dtype={"Name": str, "Length": int, "EffectiveLength": float, "TPM": float, "NumReads": float},
                )
            elif organism == "pathogen":  # read quantification table containing pathogen transcripts
                quant_table = pd.read_csv(
                    input_file + "/pathogen_quant.sf",
                    sep="\t",
                    header=0,
                    index_col=0,
                    dtype={"Name": str, "Length": int, "EffectiveLength": float, "TPM": float, "NumReads": float},
                )
            elif organism == "host":  # read quantification table containing host transcripts
                quant_table = pd.read_csv(
                    input_file + "/host_quant.sf",
                    sep="\t",
                    header=0,
                    index_col=0,
                    dtype={"Name": str, "Length": int, "EffectiveLength": float, "TPM": float, "NumReads": float},
                )
        # create new column names that keep the sample name
        name_file = input_file.split("/")
        new_col = [name_file[0] + "_" + column for column in quant_table.columns]
        quant_table.columns = new_col
        if input_file == input_files[0]:  # initiate quant_merged_table with the first sample
            quant_merged_table = quant_table
        else:  # extend quant_merged_table data frame with results of a new sample
            quant_merged_table = pd.concat([quant_merged_table, quant_table], axis=1)
    quant_merged_table.index.names = [gene_attribute]
    # sort quant_merged_table by column labels
    quant_merged_table = quant_merged_table.sort_index(axis=1)
    # save results
    if organism == "both":
        quant_merged_table.to_csv("combined_quant.tsv", sep="\t")
    elif organism == "pathogen":
        quant_merged_table.to_csv("pathogen_combined_quant.tsv", sep="\t")
    elif organism == "host":
        quant_merged_table.to_csv("host_combined_quant.tsv", sep="\t")
    elif organism == "host_gene_level":
        quant_merged_table.to_csv("host_combined_gene_level.tsv", sep="\t")


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="""Combine counts from all samples""")
parser.add_argument(
"-i", "--input_files", metavar="<input_files>", nargs="+", help="Path to quantification results "
)
parser.add_argument("-q", "--quantifier", metavar="<quantifier>", help="name of quantifier")
parser.add_argument("-a", "--gene_attribute", metavar="<gene_attribute>", help="gene attribute")
parser.add_argument(
"-org",
"--organism",
metavar="<organism>",
help="host, pathogen, both, host_gene_level - option avaiable for Salmon",
)

args = parser.parse_args()

# collect either Salmon or HTSeq quantification results
if args.quantifier == "htseq":
collect_quantification_data_HTseq(args.input_files, args.quantifier, args.gene_attribute)
if args.quantifier == "salmon":
collect_quantification_data_salmon(args.input_files, args.gene_attribute, args.organism)
38 changes: 38 additions & 0 deletions bin/collect_total_raw_read_pairs.py
@@ -0,0 +1,38 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 23 14:45:52 2020

@author: B. Mika-Gospodorz

Input file: tsv file containing number of reads in fastq files created with count_total_reads.sh
Output file: total_raw_read_pairs_fastq.tsv file that contains the number of raw read pairs in each sample
Description: Used to collect the number of total raw read pairs in each sample
"""

import argparse

import pandas as pd

parser = argparse.ArgumentParser(description="""collect total raw read paires""")
parser.add_argument(
"-i", "--input_files", metavar="<input_files>", default="*.tsv", help="Path to the total_raw_reads_fastq.tsv"
)
args = parser.parse_args()


# read total_raw_reads_fastq.tsv file
df_stats = pd.read_csv(args.input_files, sep="\t", index_col=0, header=None)

# extract fastq file names
index_names = df_stats.index

# remove '_1' and '_2' suffixes from fastq file names
sample_names = [name.rsplit("_", 1)[0] for name in index_names]

# define new sample names and remove duplicated results (each sample contains results from both *_1 and *_2 fastq files)
df_stats.index = sample_names
df_stats = df_stats.reset_index().drop_duplicates(subset="index", keep="last").set_index("index")

# save results
df_stats.to_csv("total_raw_read_pairs_fastq.tsv", sep="\t", header=False)
52 changes: 52 additions & 0 deletions bin/combine_quant_annotations.py
@@ -0,0 +1,52 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 9 12:07:56 2020

@author: B. Mika-Gospodorz

Input files: .tsv quantification table with combined results from all samples
.tsv file with annotations extracted from gff using extract_annotations_from_gff.py
Output file: *_combined_quant_annotations.tsv with gene annotations and quantification results
Description: Used to combine annotations with quantification results
"""

import argparse

import pandas as pd


# function to combine annotations with quantification results
def combine_annotations_quant(quantification_table, annotations_table, gene_attribute, organism):
    # read quantification results
    col_names = pd.read_csv(quantification_table, sep="\t", nrows=0).columns
    types_dict = {gene_attribute: str}
    types_dict.update({col: float for col in col_names if col not in types_dict})
    quantification = pd.read_csv(quantification_table, sep="\t", index_col=0, dtype=types_dict)
    # read annotations
    annotations = pd.read_csv(annotations_table, sep="\t", index_col=0, dtype="str")
    # combine annotations and quantification results
    quant_merged_table = pd.concat([annotations, quantification], axis=1, join="inner").sort_index()
    quant_merged_table.index.names = [gene_attribute]
    # save results
    if organism == "pathogen":
        quant_merged_table.to_csv("pathogen_combined_quant_annotations.tsv", sep="\t")
    elif organism == "host":
        quant_merged_table.to_csv("host_combined_quant_annotations.tsv", sep="\t")


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="""Combine annotations with quantification results""")
parser.add_argument(
"-q", "--quantification_table", metavar="<quantification_table>", help="Path to quantification results "
)
parser.add_argument(
"-annotations", "--annotations", metavar="<annotations>", help="Path to annotations extracted from gff file"
)
parser.add_argument("-a", "--gene_attribute", metavar="<gene_attribute>", help="gene attribute")
parser.add_argument("-org", "--organism", metavar="<organism>", help="host, pathogen or both")

args = parser.parse_args()

# combine annotations with quantification results
combine_annotations_quant(args.quantification_table, args.annotations, args.gene_attribute, args.organism)
55 changes: 55 additions & 0 deletions bin/combine_tables.py
@@ -0,0 +1,55 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 6 09:26:36 2019

@author: B.Mika-Gospodorz


Input files: list of .txt files with mapping statistics for different samples generated with count_multi_mapped_reads.sh, count_uniquely_mapped_read_pairs.sh or count_multi_mapped_read_pairs.sh
Output file: .tsv file with combined statistics from all samples
Description: Used to collect mapping statistics from all samples
"""


import argparse

import pandas as pd


# function to combine mapping statistics from all samples
def combine_stat_tables(input_files, suffix):
    combined_tables = pd.DataFrame()  # initialize data frame
    for input_file in input_files:  # iterate over input files
        # read file
        table_read = pd.read_csv(input_file, sep=" ", header=None, index_col=1)
        # extract sample name from the first column
        sample_name = str(table_read[0].iloc[0])
        table_read = table_read.drop(0, axis=1)  # drop the sample-name column, keeping only the counts
        table_read.columns = [sample_name]
        # add new table to combined_tables data frame
        combined_tables = pd.concat([combined_tables, table_read], axis=1)
    # transpose table to have 'host' and 'pathogen' labels as column names
    combined_tables = combined_tables.transpose()
    combined_tables.columns = ["host_" + suffix, "pathogen_" + suffix]
    return combined_tables


parser = argparse.ArgumentParser(description="""Combine mapping statistis from all samples""")
parser.add_argument(
"-i",
"--input_files",
metavar="<input_files>",
nargs="+",
default="*.txt",
help="Path to mapping statistic results ",
)
parser.add_argument("-o", "--output_dir", metavar="<output>", help="output dir", default=".")
parser.add_argument("-s", "--suffix", metavar="<suffix>", help="uniquely_mapped/multi_mapped", default=".")
args = parser.parse_args()

# combine results
combineTables = combine_stat_tables(args.input_files, args.suffix)

# save results
combineTables.to_csv(args.output_dir, sep="\t")