96 mapping statistics #97

Open
wants to merge 5 commits into dev
3 changes: 3 additions & 0 deletions .gitignore
@@ -5,3 +5,6 @@ results/
testing/
testing*
*.pyc
.nf-test/
test/
.nf-test.log
2 changes: 2 additions & 0 deletions README.md
@@ -90,3 +90,5 @@ You can cite the `nf-core` publication as follows:
> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.
>
> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).

nextflow run main.nf -profile test_hackathon,docker --outdir=test
104 changes: 104 additions & 0 deletions bin/calculate_TPM_HTSeq.R
@@ -0,0 +1,104 @@
#!/usr/bin/env Rscript

#-------------------------
#
# Description:
# Script to calculate TPM values for HTSeq quantification results
#
# Created by B. Mika-Gospodorz
# Date: 29th May 2020
#
# Input args: [1] = quantification_results_uniquely_mapped.tsv - HTSeq quantification results
# [2] = host gene attribute used for quantification, e.g. 'gene_id'
# [3] = pathogen gff file with gene features (from 3rd col) replaced with 'quant'
# [4] = host gff file with gene features (from 3rd col) replaced with 'quant'
# Output file: quantification_results_uniquely_mapped_NumReads_TPM.tsv
#
#-------------------------
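
# Example invocation (file names are illustrative only, not prescribed by the pipeline):
#   Rscript calculate_TPM_HTSeq.R quantification_results_uniquely_mapped.tsv gene_id \
#       pathogen_quant.gff3 host_quant.gff3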

library(plyr)
library(rtracklayer)


args = commandArgs(trailingOnly=TRUE)
# read quantification table
gene_attribute <- args[2]
table_htseq <- read.table(args[1], sep="\t", stringsAsFactors=F, header = T,row.names = gene_attribute)
pathogen_gff <- import(args[3])
host_gff <- import(args[4])


# function to extract gene length from gff file
extract_transcript_length <- function(x, host_gff, gene_attribute){
    # find annotations that contain host transcripts
    h_tr <- host_gff[mcols(host_gff)[gene_attribute][[1]] == x]
    # filter 'quant' features
    quants <- h_tr[h_tr$type == "quant", ]
    # merge overlapping features, so that the same bases are covered by a reduced collection of features
    t_length <- sum(width(reduce(quants)))
    return(t_length)
}
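
# Note: reduce() above merges overlapping 'quant' features before their widths are summed,
# e.g. ranges 1-100 and 50-150 collapse to 1-150 (illustrative coordinates), so shared bases
# are counted only once per transcript.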

######################### extract gene length #######################################

### pathogen
names_gff_pathogen <- mcols(pathogen_gff)[gene_attribute][[1]] # extract gene names from gff file
common_pathogen = intersect(rownames(table_htseq), names_gff_pathogen) # find positions of pathogen genes in quantification table
pathogen_table_htseq <- table_htseq[common_pathogen,] # extract pathogen quantification results
pathogen_gff_match <- match(common_pathogen,names_gff_pathogen) # find positions of corresponding genes in gff file
pathogen_table_htseq <- cbind(length = pathogen_gff@ranges@width[pathogen_gff_match],pathogen_table_htseq) # extract gene length from gff file and combine it with quantification table


### host
names_gff_host <- mcols(host_gff)[gene_attribute][[1]] # extract gene names from gff file
lack_of_attribute <- which(is.na(names_gff_host)) # find positions without gene_attribute (e.g. genes that don't have a transcript_id attribute)
if (length(lack_of_attribute) > 0) {
    host_gff <- host_gff[-lack_of_attribute] # remove positions without gene_attribute
    names_gff_host <- names_gff_host[-lack_of_attribute] # keep gene names that contain gene_attribute
}
common_host = intersect(rownames(table_htseq),names_gff_host) # find positions of host genes in quantification table
host_table_htseq <- table_htseq[common_host,] #extract quantification results of host genes
transcript_length <- sapply(common_host, extract_transcript_length, host_gff=host_gff,gene_attribute = gene_attribute,simplify = TRUE) #extract gene length
host_table_htseq <- cbind(length = transcript_length,host_table_htseq) # add gene length into quantification table


# combine host and pathogen tables
htseq_table_with_gene_length = rbind(pathogen_table_htseq,host_table_htseq)


######################### calculate TPM values #######################################

# function to calculate TPMs
tpm <- function(counts, lengths) {
    rate <- counts / lengths
    return(rate / sum(rate) * 1e6)
}
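
# Worked example (illustrative numbers): for counts c(10, 20) and lengths c(1000, 4000),
# the length-normalised rates are c(0.01, 0.005); dividing by their sum (0.015) and scaling
# by 1e6 gives TPMs of ~666667 and ~333333, which sum to 1e6 for the sample.
# tpm(c(10, 20), c(1000, 4000))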

# function to add _TPM suffix
rename_add_TPM <- function(x) {
    paste(x, "_TPM", sep = "")
}

# function to add _NumReads suffix
rename_add_NumReads <- function(x) {
    paste(x, "_NumReads", sep = "")
}


# calculate TPMs for each gene in each sample
TPMs <- apply(htseq_table_with_gene_length[2:dim(htseq_table_with_gene_length)[2]], 2, function(x) tpm(x, htseq_table_with_gene_length$length))
colnames(TPMs) <- colnames(htseq_table_with_gene_length[,-1])
# add TPM suffix to column names with TPM values
colnames(TPMs) <- sapply(colnames(TPMs), function(x) rename_add_TPM(x))
# add NumReads suffix to column names with NumReads values
colnames_NR <- sapply(colnames(htseq_table_with_gene_length[,-1]),function(x) rename_add_NumReads(x))
# add 'length' name for column which stores gene length
colnames(htseq_table_with_gene_length) <- (c('length',colnames_NR))

# combine results
quant_results <- cbind(name = rownames(TPMs), htseq_table_with_gene_length,TPMs)
names(quant_results)[1] <- gene_attribute

# save results
write.table(quant_results,file = "quantification_results_uniquely_mapped_NumReads_TPM.tsv",sep = "\t", row.names = F, quote = F)

115 changes: 115 additions & 0 deletions bin/collect_quantification_data.py
@@ -0,0 +1,115 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 4 15:00:57 2019

@author: B. Mika-Gospodorz

Input file: list of quantification tables
Output files: quantification_stats_*.tsv and quantification_results_*.tsv (HTSeq option), or *combined_quant.tsv / host_combined_gene_level.tsv (Salmon option)
Description: Used to merge quantification results from all samples
"""

import argparse

import pandas as pd


# function to merge HTSeq quantification results
def collect_quantification_data_HTseq(input_files, profile, gene_attribute):
    # initiate merged quantification data frame
    quant_merged_table = pd.DataFrame()
    for input_file in input_files:  # iterate over sample results
        # read quantification data
        quant_table = pd.read_csv(input_file, sep="\t", header=0)
        if input_file == input_files[0]:  # initiate quant_merged_table with the first sample
            quant_merged_table = quant_table
        else:  # extend quant_merged_table data frame with results of a new sample
            quant_merged_table = pd.concat([quant_merged_table, quant_table], axis=1)
    quant_merged_table.index.names = [gene_attribute]
    # sort quant_merged_table by column labels
    quant_merged_table = quant_merged_table.sort_index(axis=1)

    # extract the last 5 rows of the HTSeq quantification table that contain statistics and save them
    alignment_stats = quant_merged_table["__no_feature":"__alignment_not_unique"]
    alignment_stats.to_csv("quantification_stats_" + profile + ".tsv", sep="\t")
    # remove statistics from quantification results and save quant_merged_table
    quant_merged_table = quant_merged_table.drop(
        ["__no_feature", "__ambiguous", "__too_low_aQual", "__not_aligned", "__alignment_not_unique"]
    )
    quant_merged_table.to_csv("quantification_results_" + profile + ".tsv", sep="\t")


# function to merge Salmon quantification results
def collect_quantification_data_salmon(input_files, gene_attribute, organism):
    for input_file in input_files:  # iterate over sample results
        if organism == "host_gene_level":  # read tximport results
            quant_table = pd.read_csv(input_file, sep="\t", header=0, index_col=0)
        else:
            if organism == "both":  # read quantification table containing both host and pathogen transcripts
                quant_table = pd.read_csv(
                    input_file + "/quant.sf",
                    sep="\t",
                    header=0,
                    index_col=0,
                    dtype={"Name": str, "Length": int, "EffectiveLength": float, "TPM": float, "NumReads": float},
                )
            elif organism == "pathogen":  # read quantification table containing pathogen transcripts
                quant_table = pd.read_csv(
                    input_file + "/pathogen_quant.sf",
                    sep="\t",
                    header=0,
                    index_col=0,
                    dtype={"Name": str, "Length": int, "EffectiveLength": float, "TPM": float, "NumReads": float},
                )
            elif organism == "host":  # read quantification table containing host transcripts
                quant_table = pd.read_csv(
                    input_file + "/host_quant.sf",
                    sep="\t",
                    header=0,
                    index_col=0,
                    dtype={"Name": str, "Length": int, "EffectiveLength": float, "TPM": float, "NumReads": float},
                )
        # create new column names that keep the sample name
        name_file = input_file.split("/")
        new_col = [name_file[0] + "_" + column for column in quant_table.columns]
        quant_table.columns = new_col
        if input_file == input_files[0]:  # initiate quant_merged_table with the first sample
            quant_merged_table = quant_table
        else:  # extend quant_merged_table data frame with results of a new sample
            quant_merged_table = pd.concat([quant_merged_table, quant_table], axis=1)
    quant_merged_table.index.names = [gene_attribute]
    # sort quant_merged_table by column labels
    quant_merged_table = quant_merged_table.sort_index(axis=1)
    # save results
    if organism == "both":
        quant_merged_table.to_csv("combined_quant.tsv", sep="\t")
    elif organism == "pathogen":
        quant_merged_table.to_csv("pathogen_combined_quant.tsv", sep="\t")
    elif organism == "host":
        quant_merged_table.to_csv("host_combined_quant.tsv", sep="\t")
    elif organism == "host_gene_level":
        quant_merged_table.to_csv("host_combined_gene_level.tsv", sep="\t")


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="""Combine counts from all samples""")
parser.add_argument(
"-i", "--input_files", metavar="<input_files>", nargs="+", help="Path to quantification results "
)
parser.add_argument("-q", "--quantifier", metavar="<quantifier>", help="name of quantifier")
parser.add_argument("-a", "--gene_attribute", metavar="<gene_attribute>", help="gene attribute")
parser.add_argument(
"-org",
"--organism",
metavar="<organism>",
help="host, pathogen, both, host_gene_level - option avaiable for Salmon",
)

args = parser.parse_args()

# collect either Salmon or HTSeq quantification results
if args.quantifier == "htseq":
collect_quantification_data_HTseq(args.input_files, args.quantifier, args.gene_attribute)
if args.quantifier == "salmon":
collect_quantification_data_salmon(args.input_files, args.gene_attribute, args.organism)
38 changes: 38 additions & 0 deletions bin/collect_total_raw_read_pairs.py
@@ -0,0 +1,38 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 23 14:45:52 2020

@author: B. Mika-Gospodorz

Input file: tsv file containing number of reads in fastq files created with count_total_reads.sh
Output file: total_raw_read_pairs_fastq.tsv file that contains the number of raw read pairs in each sample
Description: Used to collect the number of total raw read pairs in each sample
"""

import argparse

import pandas as pd

parser = argparse.ArgumentParser(description="""collect total raw read paires""")
parser.add_argument(
"-i", "--input_files", metavar="<input_files>", default="*.tsv", help="Path to the total_raw_reads_fastq.tsv"
)
args = parser.parse_args()


# read total_raw_reads_fastq.tsv file
df_stats = pd.read_csv(args.input_files, sep="\t", index_col=0, header=None)

# extract fastq file names
index_names = df_stats.index

# remove '_1' and '_2' suffixes from fastq file names
sample_names = [name.rsplit("_", 1)[0] for name in index_names]

# define new sample names and remove duplicated results (each sample contains results from both *_1 and *_2 fastq files)
df_stats.index = sample_names
df_stats = df_stats.reset_index().drop_duplicates(subset="index", keep="last").set_index("index")

# save results
df_stats.to_csv("total_raw_read_pairs_fastq.tsv", sep="\t", header=False)
52 changes: 52 additions & 0 deletions bin/combine_quant_annotations.py
@@ -0,0 +1,52 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 9 12:07:56 2020

@author: B. Mika-Gospodorz

Input files: .tsv quantification table with combined results from all samples
.tsv file with annotations extracted from gff using extract_annotations_from_gff.py
Output file: *_combined_quant_annotations.tsv with gene annotations and quantification results
Description: Used to combine annotations with quantification results
"""

import argparse

import pandas as pd


# function to combine annotations with quantification results
def combine_annotations_quant(quantification_table, annotations_table, gene_attribute, organism):
    # read quantification results
    col_names = pd.read_csv(quantification_table, sep="\t", nrows=0).columns
    types_dict = {gene_attribute: str}
    types_dict.update({col: float for col in col_names if col not in types_dict})
    quantification = pd.read_csv(quantification_table, sep="\t", index_col=0, dtype=types_dict)
    # read annotations
    annotations = pd.read_csv(annotations_table, sep="\t", index_col=0, dtype="str")
    # combine annotations and quantification results
    quant_merged_table = pd.concat([annotations, quantification], axis=1, join="inner").sort_index()
    quant_merged_table.index.names = [gene_attribute]
    # save results
    if organism == "pathogen":
        quant_merged_table.to_csv("pathogen_combined_quant_annotations.tsv", sep="\t")
    elif organism == "host":
        quant_merged_table.to_csv("host_combined_quant_annotations.tsv", sep="\t")


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="""Combine annotations with quantification results""")
parser.add_argument(
"-q", "--quantification_table", metavar="<quantification_table>", help="Path to quantification results "
)
parser.add_argument(
"-annotations", "--annotations", metavar="<annotations>", help="Path to annotations extracted from gff file"
)
parser.add_argument("-a", "--gene_attribute", metavar="<gene_attribute>", help="gene attribute")
parser.add_argument("-org", "--organism", metavar="<organism>", help="host, pathogen or both")

args = parser.parse_args()

# combine annotations with quantification results
combine_annotations_quant(args.quantification_table, args.annotations, args.gene_attribute, args.organism)
55 changes: 55 additions & 0 deletions bin/combine_tables.py
@@ -0,0 +1,55 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 6 09:26:36 2019

@author: B.Mika-Gospodorz


Input files: list of .txt files with mapping statistics for different samples generated with count_multi_mapped_reads.sh, count_uniquely_mapped_read_pairs.sh or count_multi_mapped_read_pairs.sh
Output file: .tsv file with combined statistics from all samples
Description: Used to collect mapping statistics from all samples
"""


import argparse

import pandas as pd


# function to combine mapping statistics from all samples
def combine_stat_tables(input_files, suffix):
    combined_tables = pd.DataFrame()  # initialize data frame
    for input_file in input_files:  # iterate over input files
        # read file
        table_read = pd.read_csv(input_file, sep=" ", header=None, index_col=1)
        # extract sample name from the first column
        sample_name = str(table_read[0].iloc[0])
        table_read = table_read.drop(0, axis=1)  # drop the sample-name column, keeping only the counts
        table_read.columns = [sample_name]
        # add new table to combined_tables data frame
        combined_tables = pd.concat([combined_tables, table_read], axis=1)
    # transpose table to have 'host' and 'pathogen' labels as column names
    combined_tables = combined_tables.transpose()
    combined_tables.columns = ["host_" + suffix, "pathogen_" + suffix]
    return combined_tables


parser = argparse.ArgumentParser(description="""Combine mapping statistis from all samples""")
parser.add_argument(
"-i",
"--input_files",
metavar="<input_files>",
nargs="+",
default="*.txt",
help="Path to mapping statistic results ",
)
parser.add_argument("-o", "--output_dir", metavar="<output>", help="output dir", default=".")
parser.add_argument("-s", "--suffix", metavar="<suffix>", help="uniquely_mapped/multi_mapped", default=".")
args = parser.parse_args()

# combine results
combineTables = combine_stat_tables(args.input_files, args.suffix)

# save results
combineTables.to_csv(args.output_dir, sep="\t")