Skip to content

Commit

Permalink
filter out low coverage contigs
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewjpage committed Feb 1, 2017
1 parent f65755d commit 9bff2c0
Show file tree
Hide file tree
Showing 12 changed files with 104 additions and 42 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
v0.1.2 - 1/02/17
------
Allow for low coverage contigs to be filtered out.

v0.1.1 - 31/01/17
------
Increase default min contig length to be about 4 times the average fragment size.
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.1.1
0.1.2
7 changes: 7 additions & 0 deletions plasmidtron/InputTypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,10 @@ def is_max_kmers_threshold_valid(value_str):
return max_kmers_threshold
raise argparse.ArgumentTypeError("Invalid maximum kmers threshold, it must be between 10 and 255, and greater than the minimum kmer value, but ideally greater than the coverage.")

def is_min_spades_contig_coverage_valid(value_str):
if value_str.isdigit():
min_coverage = int(value_str)
if min_coverage >= 0 and min_coverage <= 1000000:
return min_coverage
raise argparse.ArgumentTypeError("Invalid minimum SPAdes contig coverage, try between 20 and 30.")

2 changes: 2 additions & 0 deletions plasmidtron/KmcFilter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import logging
import tempfile
import subprocess
import shutil
from plasmidtron.FastqReadNames import FastqReadNames

'''Given a kmer database extract filter a FASTQ file for a sample'''
Expand Down Expand Up @@ -40,3 +41,4 @@ def filter_fastq_file_against_kmers(self):
def cleanup(self):
os.remove(self.intermediate_filtered_fastq)
os.remove(self.read_names_file)
shutil.rmtree(self.temp_working_dir)
42 changes: 25 additions & 17 deletions plasmidtron/PlasmidTron.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,18 @@ class PlasmidTron:
def __init__(self,options):
self.start_time = int(time.time())
self.logger = logging.getLogger(__name__)
self.output_directory = options.output_directory
self.file_of_trait_fastqs = options.file_of_trait_fastqs
self.file_of_nontrait_fastqs = options.file_of_nontrait_fastqs
self.verbose = options.verbose
self.threads = options.threads
self.kmer = options.kmer
self.min_kmers_threshold = options.min_kmers_threshold
self.max_kmers_threshold = options.max_kmers_threshold
self.spades_exec = options.spades_exec
self.min_contig_len = options.min_contig_len
self.action = options.action
self.output_directory = options.output_directory
self.file_of_trait_fastqs = options.file_of_trait_fastqs
self.file_of_nontrait_fastqs = options.file_of_nontrait_fastqs
self.verbose = options.verbose
self.threads = options.threads
self.kmer = options.kmer
self.min_kmers_threshold = options.min_kmers_threshold
self.max_kmers_threshold = options.max_kmers_threshold
self.spades_exec = options.spades_exec
self.min_contig_len = options.min_contig_len
self.action = options.action
self.min_spades_contig_coverage = options.min_spades_contig_coverage

def run(self):
os.makedirs(self.output_directory)
Expand All @@ -51,6 +52,7 @@ def run(self):
kmc_filter.filter_fastq_file_against_kmers()
kmc_filters.append(kmc_filter)

kmc_fastas = []
spades_assemblies = []
for sample in trait_samples:
spades_assembly = SpadesAssembly( sample,
Expand All @@ -59,7 +61,8 @@ def run(self):
self.kmer,
self.spades_exec,
self.min_contig_len,
True)
True,
self.min_spades_contig_coverage)
spades_assembly.run()
self.logger.info("Rescaffold 1st assembly with all reads")
# Next we want to scaffold by using all of the original reads to join up the small contigs.
Expand All @@ -71,6 +74,7 @@ def run(self):
self.kmer,
self.min_kmers_threshold,
self.max_kmers_threshold)
kmc_fastas.append(kmc_fasta)

# Pull out any reads matching the kmers found in the assembly
self.logger.info("Pull out reads from original fastq files matching assembly kmers")
Expand All @@ -86,16 +90,17 @@ def run(self):
spades_assembly.cleanup()

self.logger.info("Reassemble with SPAdes")
spades_assembly = SpadesAssembly( sample,
final_spades_assembly = SpadesAssembly( sample,
self.output_directory,
self.threads,
self.kmer,
self.spades_exec,
self.min_contig_len,
False)
spades_assembly.run()
spades_assemblies.append(spades_assembly)
print(spades_assembly.filtered_spades_assembly_file()+"\n")
False,
self.min_spades_contig_coverage)
final_spades_assembly.run()
spades_assemblies.append(final_spades_assembly)
print(final_spades_assembly.filtered_spades_assembly_file()+"\n")

method_file = Methods(os.path.join(self.output_directory, 'methods_summary.txt'), trait_samples, nontrait_samples, self.min_kmers_threshold, self.min_contig_len, self.start_time, self.spades_exec)
method_file.create_file()
Expand All @@ -108,6 +113,9 @@ def run(self):

kmc_complex.cleanup()

for kmc_fasta in kmc_fastas:
kmc_fasta.cleanup()

for kmc_filter in kmc_filters:
kmc_filter.cleanup()

Expand Down
20 changes: 19 additions & 1 deletion plasmidtron/SpadesAssembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,12 @@
import tempfile
import subprocess
import shutil
import re
from Bio import SeqIO

'''Assemble a filtered sample with SPAdes'''
class SpadesAssembly:
def __init__(self, sample, output_directory, threads, kmer, spades_exec, minimum_length, use_temp_directory):
def __init__(self, sample, output_directory, threads, kmer, spades_exec, minimum_length, use_temp_directory,min_spades_contig_coverage):
self.logger = logging.getLogger(__name__)
self.output_directory = output_directory
self.sample = sample
Expand All @@ -16,6 +17,7 @@ def __init__(self, sample, output_directory, threads, kmer, spades_exec, minimum
self.minimum_length = minimum_length
self.spades_exec = spades_exec
self.use_temp_directory = use_temp_directory
self.min_spades_contig_coverage = min_spades_contig_coverage

if not os.path.exists(self.output_directory):
os.makedirs(self.output_directory)
Expand All @@ -35,13 +37,29 @@ def spades_assembly_file(self):

def filtered_spades_assembly_file(self):
return os.path.join(self.spades_output_directory(), 'filtered_scaffolds.fasta')

def sequence_coverage(self, contig_name):
# SPAdes add the coverage to the contig name.
#NODE_447_length_1644_cov_25.5669
m = re.search('cov_([\d]+)', contig_name)
if m and m.group(0):
return m.group(1)
else:
# return a number big enough that it will always keep the contig
return self.min_spades_contig_coverage + 1

def remove_small_contigs(self,input_file, output_file):
with open(input_file, "r") as spades_input_file, open(output_file, "w") as spades_output_file:
sequences = []
for record in SeqIO.parse(spades_input_file, "fasta"):
if self.sequence_coverage(record.id) < self.min_spades_contig_coverage:
self.logger.info("Excluding contig with coverage of "+ str(self.sequence_coverage(record.id))+ " from "+ self.spades_assembly_file())
continue

if len(record.seq) > self.minimum_length:
sequences.append(record)
else:
self.logger.info("Excluding contig of length "+ str(len(record.seq))+ " from "+ self.spades_assembly_file() )

SeqIO.write(sequences, spades_output_file, "fasta")

Expand Down
27 changes: 14 additions & 13 deletions plasmidtron/tests/PlasmidTron_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,19 @@
from plasmidtron.PlasmidTron import PlasmidTron

class Options:
def __init__(self,output_directory, file_of_trait_fastqs, file_of_nontrait_fastqs, verbose, threads, kmer, min_kmers_threshold,max_kmers_threshold, spades_exec, min_contig_len, action):
self.output_directory = output_directory
self.file_of_trait_fastqs = file_of_trait_fastqs
self.file_of_nontrait_fastqs = file_of_nontrait_fastqs
self.verbose = verbose
self.threads = threads
self.kmer = kmer
self.min_kmers_threshold = min_kmers_threshold
self.spades_exec = spades_exec
self.min_contig_len = min_contig_len
self.max_kmers_threshold = max_kmers_threshold
self.action = action
def __init__(self,output_directory, file_of_trait_fastqs, file_of_nontrait_fastqs, verbose, threads, kmer, min_kmers_threshold,max_kmers_threshold, spades_exec, min_contig_len, action, min_spades_contig_coverage):
self.output_directory = output_directory
self.file_of_trait_fastqs = file_of_trait_fastqs
self.file_of_nontrait_fastqs = file_of_nontrait_fastqs
self.verbose = verbose
self.threads = threads
self.kmer = kmer
self.min_kmers_threshold = min_kmers_threshold
self.spades_exec = spades_exec
self.min_contig_len = min_contig_len
self.max_kmers_threshold = max_kmers_threshold
self.action = action
self.min_spades_contig_coverage = min_spades_contig_coverage

test_modules_dir = os.path.dirname(os.path.realpath(__file__))
data_dir = os.path.join(test_modules_dir, 'data','plasmidtron')
Expand All @@ -26,7 +27,7 @@ def test_small_valid_chrom_plasmid(self):
'''Given small FASTQS of simulated reads, with a chromosome in 1 and chromosome+plasmid in the other run the whole pipeline'''
if os.path.exists(os.path.join(data_dir,'out')):
shutil.rmtree(os.path.join(data_dir,'out'))
options = Options(os.path.join(data_dir,'out'), os.path.join(data_dir,'traits.csv'), os.path.join(data_dir,'nontraits.csv'),True, 1, 61, 20,200, 'spades.py', 100,'union')
options = Options(os.path.join(data_dir,'out'), os.path.join(data_dir,'traits.csv'), os.path.join(data_dir,'nontraits.csv'),True, 1, 61, 20,200, 'spades.py', 100,'union', 1)

plasmid_tron = PlasmidTron(options)
plasmid_tron.run()
Expand Down
15 changes: 14 additions & 1 deletion plasmidtron/tests/SpadesAssembly_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ class TestSpadesAssembly(unittest.TestCase):
def test_filtering_small_contigs(self):
sample = SampleData('/path/to/sample_1.fastq.gz','/path/to/sample_2.fastq.gz' )

s = SpadesAssembly(sample, 'abc', 1, 1, '', 20, False )
s = SpadesAssembly(sample, 'abc', 1, 1, '', 20, False, 0 )
s.remove_small_contigs(os.path.join(data_dir, 'assembly_with_small_contigs.fa'), os.path.join(data_dir, 'actual_assembly_without_small_contigs.fa'))

with open(os.path.join(data_dir, 'actual_assembly_without_small_contigs.fa'), 'r') as actual_file, open(os.path.join(data_dir, 'expected_assembly_without_small_contigs.fa'), 'r') as expected_file:
Expand All @@ -20,4 +20,17 @@ def test_filtering_small_contigs(self):

self.assertEqual(actual_config_content,expected_config_content)
os.remove(os.path.join(data_dir, 'actual_assembly_without_small_contigs.fa'))

def test_filtering_low_coverage_contigs(self):
sample = SampleData('/path/to/sample_1.fastq.gz','/path/to/sample_2.fastq.gz' )

s = SpadesAssembly(sample, 'abc', 1, 1, '', 1, False, 24 )
s.remove_small_contigs(os.path.join(data_dir, 'assembly_with_small_contigs.fa'), os.path.join(data_dir, 'actual_assembly_without_low_coverage_contigs.fa'))

with open(os.path.join(data_dir, 'actual_assembly_without_low_coverage_contigs.fa'), 'r') as actual_file, open(os.path.join(data_dir, 'expected_assembly_coverage.fa'), 'r') as expected_file:
actual_config_content = actual_file.read()
expected_config_content = expected_file.read()

self.assertEqual(actual_config_content,expected_config_content)
os.remove(os.path.join(data_dir, 'actual_assembly_without_low_coverage_contigs.fa'))

Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
>small_contig1
>NODE_1_length_9_cov_58.2272
AAAAAAAAA
>small_contig2
>NODE_2_length_9_cov_54.8054
AAAAAAAAA
>large_contig3
>NODE_3_length_24_cov_24.4284
CCCCCCCCCCCCCCCCCCCCCCCC
>small_contig4
>NODE_4_length_9_cov_23.3323
AAAAAAAAA
>small_contig5
>NODE_5_length_9_cov_22.1132
AAAAAAAAA
>large_contig6
>NODE_6_length_24_cov_100.9852
CCCCCCCCCCCCCCCCCCCCCCCC
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
>NODE_1_length_9_cov_58.2272
AAAAAAAAA
>NODE_2_length_9_cov_54.8054
AAAAAAAAA
>NODE_3_length_24_cov_24.4284
CCCCCCCCCCCCCCCCCCCCCCCC
>NODE_6_length_24_cov_100.9852
CCCCCCCCCCCCCCCCCCCCCCCC
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
>large_contig3
>NODE_3_length_24_cov_24.4284
CCCCCCCCCCCCCCCCCCCCCCCC
>large_contig6
>NODE_6_length_24_cov_100.9852
CCCCCCCCCCCCCCCCCCCCCCCC
3 changes: 2 additions & 1 deletion scripts/plasmidtron
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ parser.add_argument('file_of_trait_fastqs', help='File of filenames of trait (ca
parser.add_argument('file_of_nontrait_fastqs', help='File of filenames of nontrait (control) FASTQs', type=InputTypes.is_file_of_nontrait_fastqs_valid)
parser.add_argument('--action', '-a', help='Control how the traits kmers are filtered for assembly [%(default)s]', choices=['intersection','union'], default = 'union')
parser.add_argument('--kmer', '-k', help='Kmer to use, depends on read length [%(default)s]', type=InputTypes.is_kmer_valid, default = 51)
parser.add_argument('--min_contig_len', '-l', help='Minimum contig length in final assembly [%(default)s]', type=InputTypes.is_min_contig_len_valid, default = 1600)
parser.add_argument('--min_contig_len', '-l', help='Minimum contig length in final assembly [%(default)s]', type=InputTypes.is_min_contig_len_valid, default = 3000)
parser.add_argument('--min_spades_contig_coverage', '-s', help='Filter out contigs with low coverage. Set to 0 to keep all. [%(default)s]', type=InputTypes.is_min_spades_contig_coverage_valid, default = 25)
parser.add_argument('--min_kmers_threshold', '-m', help='Exclude k-mers occurring less than this [%(default)s]', type=InputTypes.is_min_kmers_threshold_valid, default = 25)
parser.add_argument('--max_kmers_threshold', '-x', help='Exclude k-mers occurring more than this [%(default)s]', type=InputTypes.is_max_kmers_threshold_valid, default = 254)
parser.add_argument('--threads', '-t', help='Number of threads [%(default)s]', type=InputTypes.is_threads_valid, default = 1)
Expand Down

0 comments on commit 9bff2c0

Please sign in to comment.