Skip to content

Commit

Permalink
Merge pull request #70 from martinghunt/embl_no_cds_fix
Browse files Browse the repository at this point in the history
Embl no cds fix
  • Loading branch information
martinghunt authored Jan 12, 2017
2 parents 8297d4b + b615a85 commit f6108ea
Show file tree
Hide file tree
Showing 7 changed files with 131 additions and 3 deletions.
2 changes: 1 addition & 1 deletion iva/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import os
import sys
import subprocess
version = '1.0.7'
version = '1.0.8'

class abspathAction(argparse.Action):
def __call__(self, parser, namespace, value, option_string):
Expand Down
16 changes: 16 additions & 0 deletions iva/kraken.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,18 @@ def __init__(self, rootdir, extra_refs_file=None, threads=1, minimizer_len=13, m
self.done_files = {x:os.path.join(self.rootdir, 'progress.' + x + '.done') for x in self.tasks}


@classmethod
def count_cds_from_embl(cls, infile):
    '''Count the CDS feature lines in an EMBL file.

    infile: path of the file to read; it is opened in text mode.

    Returns the number of lines that begin with the prefix 'FT CDS '
    (feature-table tag, the CDS key, then a space). Qualifier lines and
    other feature keys are not counted.
    '''
    # NOTE(review): the prefix is compared literally, so it must match the
    # spacing used in the files being read - confirm against real EMBL input.
    with open(infile) as handle:
        return sum(1 for record_line in handle if record_line.startswith('FT CDS '))


def _mkdir(self, d, rmtree=False):
if rmtree and os.path.exists(d):
shutil.rmtree(d)
Expand Down Expand Up @@ -253,6 +265,10 @@ def _sort_out_extra_refs(self):
self._replace_fasta_header(fa_file, 'gi|' + str(new_gi) + '|x')
embl_file = os.path.join(embl_dir, gi + '.embl')
self._genbank2embl(gb_file, embl_file)
number_of_cds = Database.count_cds_from_embl(embl_file)
print('GI', gi, ' CDS:', number_of_cds)
if number_of_cds < 1:
raise Error('No CDS found for GI ' + gi + '. Can only use references that have at least one CDS. Cannot continue.')

self._get_parent_taxons(real_taxon_ids)

Expand Down
27 changes: 27 additions & 0 deletions iva/tests/data/kraken_count_cds_from_embl.0
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
ID NC_123456; SV 1; linear; unassigned DNA; STD; VRL; 100 BP.
XX
AC NC_123456;
XX
DT 25-MAY-2001
XX
DE Hitchhiker virus 42, complete genome.
XX
KW DASeq
XX
DR BioProject; PRJNA12345.
XX
FH Key Location/Qualifiers
FH
FT source 1..100
FT /mol_type="alien RNA"
FT /db_xref="taxon:424242"
FT /organism="Unknown"
FT gene 42..52
FT /locus_tag="Vogon1"
FT /db_xref="GeneID:42"
FT /gene="poetic"
XX
SQ Sequence 100 BP; 42 A; 42 C; 42 G; 42 T; 0 other;
gctatgctga caggtacgta cgcgcgcgtc gcagtcagcg tcgatgtccct cagtctgcga 60
cgatcgtagc cagttgcggc ccccctctct tatatataaa 100
//
34 changes: 34 additions & 0 deletions iva/tests/data/kraken_count_cds_from_embl.1
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
ID NC_123456; SV 1; linear; unassigned DNA; STD; VRL; 100 BP.
XX
AC NC_123456;
XX
DT 25-MAY-2001
XX
DE Hitchhiker virus 42, complete genome.
XX
KW DASeq
XX
DR BioProject; PRJNA12345.
XX
FH Key Location/Qualifiers
FH
FT source 1..100
FT /mol_type="alien RNA"
FT /db_xref="taxon:424242"
FT /organism="Unknown"
FT gene 42..52
FT /locus_tag="Vogon1"
FT /db_xref="GeneID:42"
FT /gene="poetic"
FT CDS join(42..44,47..52)
FT /locus_tag="Vogon1"
FT /protein_id="43"
FT /gene="poetic"
FT /note="do not listen to this gene"
FT /codon_start=1
FT /product="poem"
XX
SQ Sequence 100 BP; 42 A; 42 C; 42 G; 42 T; 0 other;
gctatgctga caggtacgta cgcgcgcgtc gcagtcagcg tcgatgtccct cagtctgcga 60
cgatcgtagc cagttgcggc ccccctctct tatatataaa 100
//
44 changes: 44 additions & 0 deletions iva/tests/data/kraken_count_cds_from_embl.2
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
ID NC_123456; SV 1; linear; unassigned DNA; STD; VRL; 100 BP.
XX
AC NC_123456;
XX
DT 25-MAY-2001
XX
DE Hitchhiker virus 42, complete genome.
XX
KW DASeq
XX
DR BioProject; PRJNA12345.
XX
FH Key Location/Qualifiers
FH
FT source 1..100
FT /mol_type="alien RNA"
FT /db_xref="taxon:424242"
FT /organism="Unknown"
FT gene 11..13
FT /locus_tag="ShortyMcShortGene"
FT /db_xref="GeneID:1"
FT /gene="tiny"
FT CDS join(11..13)
FT /locus_tag="short_tag"
FT /protein_id="11"
FT /gene="small"
FT /codon_start=1
FT /product="not a lot"
FT gene 42..52
FT /locus_tag="Vogon1"
FT /db_xref="GeneID:42"
FT /gene="poetic"
FT CDS join(42..44,47..52)
FT /locus_tag="Vogon1"
FT /protein_id="43"
FT /gene="poetic"
FT /note="do not listen to this gene"
FT /codon_start=1
FT /product="poem"
XX
SQ Sequence 100 BP; 42 A; 42 C; 42 G; 42 T; 0 other;
gctatgctga caggtacgta cgcgcgcgtc gcagtcagcg tcgatgtccct cagtctgcga 60
cgatcgtagc cagttgcggc ccccctctct tatatataaa 100
//
9 changes: 8 additions & 1 deletion iva/tests/kraken_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,13 @@ def setUp(self):
self.db = kraken.Database(os.path.join(data_dir, 'kraken_test.db'))


def test_count_cds_from_embl(self):
    '''test count_cds_from_embl'''
    # The numeric suffix of each fixture file name is also the count that
    # count_cds_from_embl is expected to return for that file.
    for expected in (0, 1, 2):
        fixture = os.path.join(data_dir, 'kraken_count_cds_from_embl.' + str(expected))
        got = kraken.Database.count_cds_from_embl(fixture)
        self.assertEqual(expected, got)


def test_get_parent_taxons(self):
'''test _get_parent_taxons'''
taxons = set(['1', '9', '13'])
Expand Down Expand Up @@ -87,7 +94,7 @@ def test_append_to_file(self):
self.db._append_to_file(tmp, '42')
self.assertTrue(filecmp.cmp(tmp, after))
os.unlink(tmp)


def test_species_to_dir(self):
'''test species_to_dir'''
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@

setup(
name='iva',
version='1.0.7',
version='1.0.8',
description='Iterative Virus Assembler',
packages = find_packages(),
package_data={'iva': ['gage/*', 'ratt/*', 'read_trim/*', 'test_run_data/*']},
Expand Down

0 comments on commit f6108ea

Please sign in to comment.