Skip to content

Commit

Permalink
add scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
rpetit3 committed Feb 14, 2023
1 parent 1223bc0 commit 6ff9864
Show file tree
Hide file tree
Showing 3 changed files with 214 additions and 0 deletions.
13 changes: 13 additions & 0 deletions CITATION.cff
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
cff-version: 1.2.0
message: "If you use Bactopia, please cite it as below."
authors:
- family-names: "Petit III"
given-names: "Robert A. "
orcid: "https://orcid.org/0000-0002-1350-9426"
- family-names: "Read"
given-names: "Timothy D."
orcid: "https://orcid.org/0000-0001-8966-9680"
title: "Bactopia: a Flexible Pipeline for Complete Analysis of Bacterial Genomes. mSystems. 5 (2020)"
doi: 10.1128/mSystems.00190-20
url: "https://github.com/bactopia/bactopia"
version: 1.4.1
94 changes: 94 additions & 0 deletions bin/bactopia-teton
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#! /bin/bash
VERSION=1.0.0
PREFIX=$1

# If no user input, print usage
if [[ $# == 0 ]]; then
echo "bactopia-teton - v${VERSION}"
echo ""
echo "bactopia-teton <OPT1> ... <OPTN>"
echo ""
exit
fi

if [ "$is_tarball" == "true" ]; then
mkdir database
tar -xzf $db -C database
KRAKEN_DB=\$(find database/ -name "hash.k2d" | sed 's=hash.k2d==')
else
KRAKEN_DB=\$(find $db/ -name "hash.k2d" | sed 's=hash.k2d==')
fi

kraken2 \\
--db \$KRAKEN_DB \\
--threads $task.cpus \\
--unclassified-out $unclassified \\
--classified-out $classified \\
--report ${prefix}.kraken2.report.txt \\
--gzip-compressed \\
$paired \\
$options.args \\
$reads > kracken.out

# Get read length
if [ "${params.bracken_read_length}" == "0" ]; then
OBS_READ_LENGTH=\$(zcat ${reads[0]} | fastq-scan -q | jq -r '.qc_stats.read_median')
echo \$OBS_READ_LENGTH
# Pre-built Bracken databases come with 50,75,100,150,200,250,300, split the difference
if [ "\$OBS_READ_LENGTH" -gt 275 ]; then
READ_LENGTH="300"
elif [ "\$OBS_READ_LENGTH" -gt 225 ]; then
READ_LENGTH="250"
elif [ "\$OBS_READ_LENGTH" -gt 175 ]; then
READ_LENGTH="200"
elif [ "\$OBS_READ_LENGTH" -gt 125 ]; then
READ_LENGTH="150"
elif [ "\$OBS_READ_LENGTH" -gt 85 ]; then
READ_LENGTH="100"
elif [ "\$OBS_READ_LENGTH" -gt 65 ]; then
READ_LENGTH="75"
else
READ_LENGTH="50"
fi
else
# use user defined read length
READ_LENGTH="${params.bracken_read_length}"
fi

bracken \\
$options.args2 \\
-d \$KRAKEN_DB \\
-r \$READ_LENGTH \\
-i ${prefix}.kraken2.report.txt \\
-w ${prefix}.bracken.report.txt \\
-o bracken.temp

# Sort bracken report by 'fraction_total_reads' (column 7)
head -n 1 bracken.temp > ${prefix}.bracken.abundances.txt
grep -v "fraction_total_reads\$" bracken.temp | sort -k 7 -rn >> ${prefix}.bracken.abundances.txt

# Compress Kraken FASTQs
pigz -p $task.cpus *.fastq

# Adjust bracken to include unclassified and produce summary
kraken-bracken-summary.py \\
${prefix} \\
${prefix}.kraken2.report.txt \\
${prefix}.bracken.report.txt \\
${prefix}.bracken.abundances.txt

# Create a Krona report from reports
if [ "${params.skip_krona}" == "false" ]; then
# Kraken2
kreport2krona.py \\
--report ${prefix}.kraken2.report.txt \\
--output kraken2-krona.temp
ktImportText -o ${prefix}.kraken2.krona.html kraken2-krona.temp

# Bracken
kreport2krona.py \\
--report ${prefix}.bracken.report.txt \\
--output bracken-krona.temp
ktImportText -o ${prefix}.bracken.krona.html bracken-krona.temp
rm *-krona.temp
fi
107 changes: 107 additions & 0 deletions bin/kraken-bracken-summary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
#! /usr/bin/env python3
"""
kraken-bracken-summary.py
"""
PROGRAM = "kraken-bracken-summary"
VERSION = "2.2.0"
import sys

def kraken2_unclassified_count(kraken2_report):
"""
0.23 4500 4500 U 0 unclassified
"""
with open(kraken2_report, 'rt') as fh:
for line in fh:
line = line.rstrip()
cols = line.split("\t")
if cols[3] == "U":
return int(cols[2])

def braken_root_count(bracken_report):
"""
100.00 1976389 0 R 1 root
"""
with open(bracken_report, 'rt') as fh:
for line in fh:
line = line.rstrip()
cols = line.split("\t")
if cols[3] == "R":
return float(cols[1])

if __name__ == '__main__':
import argparse as ap
import pandas as pd

parser = ap.ArgumentParser(
prog=PROGRAM,
conflict_handler='resolve',
description=(
f'{PROGRAM} (v{VERSION}) - Update the Bracken abundances with unclassified counts'
)
)

parser.add_argument('prefix', metavar="PREFIX", type=str,
help='Prefix to use for output files')
parser.add_argument('kraken2_report', metavar="KRAKEN2_REPORT", type=str,
help='The Kraken2 report')
parser.add_argument('bracken_report', metavar="BRACKEN_REPORT", type=str,
help='The BRacken updated Kraken2 report')
parser.add_argument('bracken_abundances', metavar="BRACKEN_ABUNDANCES", type=str,
help='The Bracken output with abundances')
parser.add_argument('--version', action='version',
version=f'{PROGRAM} {VERSION}')

if len(sys.argv) == 1:
parser.print_help()
sys.exit(0)

args = parser.parse_args()

unclassified_count = kraken2_unclassified_count(args.kraken2_report)
total_count = unclassified_count + braken_root_count(args.bracken_report)

"""
Read Bracken abundances
name taxonomy_id taxonomy_lvl kraken_assigned_reads added_reads new_est_reads fraction_total_reads
Pseudomonas aeruginosa 287 S 591369 1309432 1900801 0.96175
Pseudomonas sp. Y5-11 2749808 S 11682 14249 25931 0.01312
Stutzerimonas stutzeri 316 S 5810 1135 6945 0.00351
"""
bracken = pd.read_csv(args.bracken_abundances, sep='\t')
bracken['fraction_total_reads'] = bracken['new_est_reads'] / total_count
bracken = bracken.sort_values(by='fraction_total_reads', ascending=False)

# Write top two and unclassified
cols = [
'sample',
'bracken_primary_species',
'bracken_primary_species_abundance',
'bracken_secondary_species',
'bracken_secondary_species_abundance',
'bracken_unclassified_abundance'
]
results = [
args.prefix,
bracken['name'].iloc[0] if bracken['fraction_total_reads'].iloc[0] >= 0.01 else "No primary abundance > 1%",
"{0:.5f}".format(bracken['fraction_total_reads'].iloc[0]) if bracken['fraction_total_reads'].iloc[0] >= 0.01 else "",
bracken['name'].iloc[1] if bracken['fraction_total_reads'].iloc[1] >= 0.01 else "No secondary abundance > 1%",
"{0:.5f}".format(bracken['fraction_total_reads'].iloc[1]) if bracken['fraction_total_reads'].iloc[1] >= 0.01 else "",
"{0:.5f}".format(unclassified_count / total_count)
]
with open("{0}.bracken.tsv".format(args.prefix), "wt") as fh_out:
fh_out.write("{}\n".format('\t'.join(cols)))
fh_out.write("{}\n".format('\t'.join(results)))

# Add unclassified to data table and re-sort
unclassified = pd.DataFrame.from_dict({
'name': ['unclassified'],
'taxonomy_id': [0],
'taxonomy_lvl': ['U'],
'kraken_assigned_reads': [unclassified_count],
'added_reads': [0],
'new_est_reads': [unclassified_count],
'fraction_total_reads': [unclassified_count / total_count]
})
bracken = pd.concat([bracken, unclassified], axis=0)
bracken = bracken.sort_values(by='fraction_total_reads', ascending=False)
bracken.to_csv("{0}.bracken.adjusted.abundances.txt".format(args.prefix), sep='\t', float_format='%.5f', index=False)

0 comments on commit 6ff9864

Please sign in to comment.