diff --git a/CHANGELOG b/CHANGELOG index 91d8aca..d6ee428 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,7 @@ +v0.1.2 - 1/02/17 +------ +Allow for low coverage contigs to be filtered out. + v0.1.1 - 31/01/17 ------ Increase default min contig length to be about 4 times the average fragment size. diff --git a/VERSION b/VERSION index 6da28dd..d917d3e 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.1.1 \ No newline at end of file +0.1.2 diff --git a/plasmidtron/InputTypes.py b/plasmidtron/InputTypes.py index f81686a..beff08a 100644 --- a/plasmidtron/InputTypes.py +++ b/plasmidtron/InputTypes.py @@ -53,3 +53,10 @@ def is_max_kmers_threshold_valid(value_str): return max_kmers_threshold raise argparse.ArgumentTypeError("Invalid maximum kmers threshold, it must be between 10 and 255, and greater than the minimum kmer value, but ideally greater than the coverage.") + def is_min_spades_contig_coverage_valid(value_str): + if value_str.isdigit(): + min_coverage = int(value_str) + if min_coverage >= 0 and min_coverage <= 1000000: + return min_coverage + raise argparse.ArgumentTypeError("Invalid minimum SPAdes contig coverage, try between 20 and 30.") + diff --git a/plasmidtron/KmcFilter.py b/plasmidtron/KmcFilter.py index 6bcb714..00e4b17 100644 --- a/plasmidtron/KmcFilter.py +++ b/plasmidtron/KmcFilter.py @@ -2,6 +2,7 @@ import logging import tempfile import subprocess +import shutil from plasmidtron.FastqReadNames import FastqReadNames '''Given a kmer database extract filter a FASTQ file for a sample''' @@ -40,3 +41,4 @@ def filter_fastq_file_against_kmers(self): def cleanup(self): os.remove(self.intermediate_filtered_fastq) os.remove(self.read_names_file) + shutil.rmtree(self.temp_working_dir) diff --git a/plasmidtron/PlasmidTron.py b/plasmidtron/PlasmidTron.py index 13d87bb..74688d0 100755 --- a/plasmidtron/PlasmidTron.py +++ b/plasmidtron/PlasmidTron.py @@ -16,17 +16,18 @@ class PlasmidTron: def __init__(self,options): self.start_time = int(time.time()) self.logger = logging.getLogger(__name__) - self.output_directory = options.output_directory - self.file_of_trait_fastqs = options.file_of_trait_fastqs - self.file_of_nontrait_fastqs = options.file_of_nontrait_fastqs - self.verbose = options.verbose - self.threads = options.threads - self.kmer = options.kmer - self.min_kmers_threshold = options.min_kmers_threshold - self.max_kmers_threshold = options.max_kmers_threshold - self.spades_exec = options.spades_exec - self.min_contig_len = options.min_contig_len - self.action = options.action + self.output_directory = options.output_directory + self.file_of_trait_fastqs = options.file_of_trait_fastqs + self.file_of_nontrait_fastqs = options.file_of_nontrait_fastqs + self.verbose = options.verbose + self.threads = options.threads + self.kmer = options.kmer + self.min_kmers_threshold = options.min_kmers_threshold + self.max_kmers_threshold = options.max_kmers_threshold + self.spades_exec = options.spades_exec + self.min_contig_len = options.min_contig_len + self.action = options.action + self.min_spades_contig_coverage = options.min_spades_contig_coverage def run(self): os.makedirs(self.output_directory) @@ -51,6 +52,7 @@ def run(self): kmc_filter.filter_fastq_file_against_kmers() kmc_filters.append(kmc_filter) + kmc_fastas = [] spades_assemblies = [] for sample in trait_samples: spades_assembly = SpadesAssembly( sample, @@ -59,7 +61,8 @@ def run(self): self.kmer, self.spades_exec, self.min_contig_len, - True) + True, + self.min_spades_contig_coverage) spades_assembly.run() self.logger.info("Rescaffold 1st assembly with all reads") # Next we want to scaffold by using all of the original reads to join up the small contigs. @@ -71,6 +74,7 @@ def run(self): self.kmer, self.min_kmers_threshold, self.max_kmers_threshold) + kmc_fastas.append(kmc_fasta) # Pull out any reads matching the kmers found in the assembly self.logger.info("Pull out reads from original fastq files matching assembly kmers") @@ -86,16 +90,17 @@ def run(self): spades_assembly.cleanup() self.logger.info("Reassemble with SPAdes") - spades_assembly = SpadesAssembly( sample, + final_spades_assembly = SpadesAssembly( sample, self.output_directory, self.threads, self.kmer, self.spades_exec, self.min_contig_len, - False) - spades_assembly.run() - spades_assemblies.append(spades_assembly) - print(spades_assembly.filtered_spades_assembly_file()+"\n") + False, + self.min_spades_contig_coverage) + final_spades_assembly.run() + spades_assemblies.append(final_spades_assembly) + print(final_spades_assembly.filtered_spades_assembly_file()+"\n") method_file = Methods(os.path.join(self.output_directory, 'methods_summary.txt'), trait_samples, nontrait_samples, self.min_kmers_threshold, self.min_contig_len, self.start_time, self.spades_exec) method_file.create_file() @@ -108,6 +113,9 @@ def run(self): kmc_complex.cleanup() + for kmc_fasta in kmc_fastas: + kmc_fasta.cleanup() + for kmc_filter in kmc_filters: kmc_filter.cleanup() diff --git a/plasmidtron/SpadesAssembly.py b/plasmidtron/SpadesAssembly.py index 4933770..f84f6d9 100644 --- a/plasmidtron/SpadesAssembly.py +++ b/plasmidtron/SpadesAssembly.py @@ -3,11 +3,12 @@ import tempfile import subprocess import shutil +import re from Bio import SeqIO '''Assemble a filtered sample with SPAdes''' class SpadesAssembly: - def __init__(self, sample, output_directory, threads, kmer, spades_exec, minimum_length, use_temp_directory): + def __init__(self, sample, output_directory, threads, kmer, spades_exec, minimum_length, use_temp_directory,min_spades_contig_coverage): self.logger = logging.getLogger(__name__) self.output_directory = output_directory self.sample = sample @@ -16,6 +17,7 @@ def __init__(self, sample, output_directory, threads, kmer, spades_exec, minimum self.minimum_length = minimum_length self.spades_exec = spades_exec self.use_temp_directory = use_temp_directory + self.min_spades_contig_coverage = min_spades_contig_coverage if not os.path.exists(self.output_directory): os.makedirs(self.output_directory) @@ -35,13 +37,29 @@ def spades_assembly_file(self): def filtered_spades_assembly_file(self): return os.path.join(self.spades_output_directory(), 'filtered_scaffolds.fasta') + + def sequence_coverage(self, contig_name): + # SPAdes add the coverage to the contig name. + #NODE_447_length_1644_cov_25.5669 + m = re.search('cov_([\d]+)', contig_name) + if m and m.group(0): + return m.group(1) + else: + # return a number big enough that it will always keep the contig + return self.min_spades_contig_coverage + 1 def remove_small_contigs(self,input_file, output_file): with open(input_file, "r") as spades_input_file, open(output_file, "w") as spades_output_file: sequences = [] for record in SeqIO.parse(spades_input_file, "fasta"): + if self.sequence_coverage(record.id) < self.min_spades_contig_coverage: + self.logger.info("Excluding contig with coverage of "+ str(self.sequence_coverage(record.id))+ " from "+ self.spades_assembly_file()) + continue + if len(record.seq) > self.minimum_length: sequences.append(record) + else: + self.logger.info("Excluding contig of length "+ str(len(record.seq))+ " from "+ self.spades_assembly_file() ) SeqIO.write(sequences, spades_output_file, "fasta") diff --git a/plasmidtron/tests/PlasmidTron_test.py b/plasmidtron/tests/PlasmidTron_test.py index 8f7f962..4cb5d6f 100644 --- a/plasmidtron/tests/PlasmidTron_test.py +++ b/plasmidtron/tests/PlasmidTron_test.py @@ -4,18 +4,19 @@ from plasmidtron.PlasmidTron import PlasmidTron class Options: - def __init__(self,output_directory, file_of_trait_fastqs, file_of_nontrait_fastqs, verbose, threads, kmer, min_kmers_threshold,max_kmers_threshold, spades_exec, min_contig_len, action): - self.output_directory = output_directory - self.file_of_trait_fastqs = file_of_trait_fastqs - self.file_of_nontrait_fastqs = file_of_nontrait_fastqs - self.verbose = verbose - self.threads = threads - self.kmer = kmer - self.min_kmers_threshold = min_kmers_threshold - self.spades_exec = spades_exec - self.min_contig_len = min_contig_len - self.max_kmers_threshold = max_kmers_threshold - self.action = action + def __init__(self,output_directory, file_of_trait_fastqs, file_of_nontrait_fastqs, verbose, threads, kmer, min_kmers_threshold,max_kmers_threshold, spades_exec, min_contig_len, action, min_spades_contig_coverage): + self.output_directory = output_directory + self.file_of_trait_fastqs = file_of_trait_fastqs + self.file_of_nontrait_fastqs = file_of_nontrait_fastqs + self.verbose = verbose + self.threads = threads + self.kmer = kmer + self.min_kmers_threshold = min_kmers_threshold + self.spades_exec = spades_exec + self.min_contig_len = min_contig_len + self.max_kmers_threshold = max_kmers_threshold + self.action = action + self.min_spades_contig_coverage = min_spades_contig_coverage test_modules_dir = os.path.dirname(os.path.realpath(__file__)) data_dir = os.path.join(test_modules_dir, 'data','plasmidtron') @@ -26,7 +27,7 @@ def test_small_valid_chrom_plasmid(self): '''Given small FASTQS of simulated reads, with a chromosome in 1 and chromosome+plasmid in the other run the whole pipeline''' if os.path.exists(os.path.join(data_dir,'out')): shutil.rmtree(os.path.join(data_dir,'out')) - options = Options(os.path.join(data_dir,'out'), os.path.join(data_dir,'traits.csv'), os.path.join(data_dir,'nontraits.csv'),True, 1, 61, 20,200, 'spades.py', 100,'union') + options = Options(os.path.join(data_dir,'out'), os.path.join(data_dir,'traits.csv'), os.path.join(data_dir,'nontraits.csv'),True, 1, 61, 20,200, 'spades.py', 100,'union', 1) plasmid_tron = PlasmidTron(options) plasmid_tron.run() diff --git a/plasmidtron/tests/SpadesAssembly_test.py b/plasmidtron/tests/SpadesAssembly_test.py index 9e9d9a4..7373167 100644 --- a/plasmidtron/tests/SpadesAssembly_test.py +++ b/plasmidtron/tests/SpadesAssembly_test.py @@ -11,7 +11,7 @@ class TestSpadesAssembly(unittest.TestCase): def test_filtering_small_contigs(self): sample = SampleData('/path/to/sample_1.fastq.gz','/path/to/sample_2.fastq.gz' ) - s = SpadesAssembly(sample, 'abc', 1, 1, '', 20, False ) + s = SpadesAssembly(sample, 'abc', 1, 1, '', 20, False, 0 ) s.remove_small_contigs(os.path.join(data_dir, 'assembly_with_small_contigs.fa'), os.path.join(data_dir, 'actual_assembly_without_small_contigs.fa')) with open(os.path.join(data_dir, 'actual_assembly_without_small_contigs.fa'), 'r') as actual_file, open(os.path.join(data_dir, 'expected_assembly_without_small_contigs.fa'), 'r') as expected_file: @@ -20,4 +20,17 @@ def test_filtering_small_contigs(self): self.assertEqual(actual_config_content,expected_config_content) os.remove(os.path.join(data_dir, 'actual_assembly_without_small_contigs.fa')) + + def test_filtering_low_coverage_contigs(self): + sample = SampleData('/path/to/sample_1.fastq.gz','/path/to/sample_2.fastq.gz' ) + + s = SpadesAssembly(sample, 'abc', 1, 1, '', 1, False, 24 ) + s.remove_small_contigs(os.path.join(data_dir, 'assembly_with_small_contigs.fa'), os.path.join(data_dir, 'actual_assembly_without_low_coverage_contigs.fa')) + + with open(os.path.join(data_dir, 'actual_assembly_without_low_coverage_contigs.fa'), 'r') as actual_file, open(os.path.join(data_dir, 'expected_assembly_coverage.fa'), 'r') as expected_file: + actual_config_content = actual_file.read() + expected_config_content = expected_file.read() + + self.assertEqual(actual_config_content,expected_config_content) + os.remove(os.path.join(data_dir, 'actual_assembly_without_low_coverage_contigs.fa')) \ No newline at end of file diff --git a/plasmidtron/tests/data/spadesassembly/assembly_with_small_contigs.fa b/plasmidtron/tests/data/spadesassembly/assembly_with_small_contigs.fa index f4fb23b..b316c32 100644 --- a/plasmidtron/tests/data/spadesassembly/assembly_with_small_contigs.fa +++ b/plasmidtron/tests/data/spadesassembly/assembly_with_small_contigs.fa @@ -1,12 +1,12 @@ ->small_contig1 +>NODE_1_length_9_cov_58.2272 AAAAAAAAA ->small_contig2 +>NODE_2_length_9_cov_54.8054 AAAAAAAAA ->large_contig3 +>NODE_3_length_24_cov_24.4284 CCCCCCCCCCCCCCCCCCCCCCCC ->small_contig4 +>NODE_4_length_9_cov_23.3323 AAAAAAAAA ->small_contig5 +>NODE_5_length_9_cov_22.1132 AAAAAAAAA ->large_contig6 +>NODE_6_length_24_cov_100.9852 CCCCCCCCCCCCCCCCCCCCCCCC diff --git a/plasmidtron/tests/data/spadesassembly/expected_assembly_covarage.fa b/plasmidtron/tests/data/spadesassembly/expected_assembly_covarage.fa new file mode 100644 index 0000000..ba9ba64 --- /dev/null +++ b/plasmidtron/tests/data/spadesassembly/expected_assembly_covarage.fa @@ -0,0 +1,8 @@ +>NODE_1_length_9_cov_58.2272 +AAAAAAAAA +>NODE_2_length_9_cov_54.8054 +AAAAAAAAA +>NODE_3_length_24_cov_24.4284 +CCCCCCCCCCCCCCCCCCCCCCCC +>NODE_6_length_24_cov_100.9852 +CCCCCCCCCCCCCCCCCCCCCCCC diff --git a/plasmidtron/tests/data/spadesassembly/expected_assembly_without_small_contigs.fa b/plasmidtron/tests/data/spadesassembly/expected_assembly_without_small_contigs.fa index f44f031..4982c31 100644 --- a/plasmidtron/tests/data/spadesassembly/expected_assembly_without_small_contigs.fa +++ b/plasmidtron/tests/data/spadesassembly/expected_assembly_without_small_contigs.fa @@ -1,4 +1,4 @@ ->large_contig3 +>NODE_3_length_24_cov_24.4284 CCCCCCCCCCCCCCCCCCCCCCCC ->large_contig6 +>NODE_6_length_24_cov_100.9852 CCCCCCCCCCCCCCCCCCCCCCCC diff --git a/scripts/plasmidtron b/scripts/plasmidtron index 8ab30d5..5c39531 100755 --- a/scripts/plasmidtron +++ b/scripts/plasmidtron @@ -23,7 +23,8 @@ parser.add_argument('file_of_trait_fastqs', help='File of filenames of trait (ca parser.add_argument('file_of_nontrait_fastqs', help='File of filenames of nontrait (control) FASTQs', type=InputTypes.is_file_of_nontrait_fastqs_valid) parser.add_argument('--action', '-a', help='Control how the traits kmers are filtered for assembly [%(default)s]', choices=['intersection','union'], default = 'union') parser.add_argument('--kmer', '-k', help='Kmer to use, depends on read length [%(default)s]', type=InputTypes.is_kmer_valid, default = 51) -parser.add_argument('--min_contig_len', '-l', help='Minimum contig length in final assembly [%(default)s]', type=InputTypes.is_min_contig_len_valid, default = 1600) +parser.add_argument('--min_contig_len', '-l', help='Minimum contig length in final assembly [%(default)s]', type=InputTypes.is_min_contig_len_valid, default = 3000) +parser.add_argument('--min_spades_contig_coverage', '-s', help='Filter out contigs with low coverage. Set to 0 to keep all. [%(default)s]', type=InputTypes.is_min_spades_contig_coverage_valid, default = 25) parser.add_argument('--min_kmers_threshold', '-m', help='Exclude k-mers occurring less than this [%(default)s]', type=InputTypes.is_min_kmers_threshold_valid, default = 25) parser.add_argument('--max_kmers_threshold', '-x', help='Exclude k-mers occurring more than this [%(default)s]', type=InputTypes.is_max_kmers_threshold_valid, default = 254) parser.add_argument('--threads', '-t', help='Number of threads [%(default)s]', type=InputTypes.is_threads_valid, default = 1)