Closes #29: Support ambiguity codes in translation

This PR adds basic support for ambiguity codes. Issue #31 proposes improved support at a later date.
biocommons · Apr 14, 2021 · c5ea54e · c5ea54e
2 parents 3a03aa0 + 5d7484b
commit c5ea54e
Show file tree

Hide file tree

Showing 5 changed files with 45 additions and 28 deletions.
diff --git a/src/bioutils/cytobands.py b/src/bioutils/cytobands.py
@@ -17,7 +17,7 @@ def get_cytoband_names():
         list of str: The names of the available cytobands. 
 
     Examples:
-        >>> get_cytoband_names()  
+        >>> sorted(get_cytoband_names())
         ['ucsc-hg19', 'ucsc-hg38']
     """
 

diff --git a/src/bioutils/sequences.py b/src/bioutils/sequences.py
@@ -420,6 +420,18 @@ def translate_cds(seq, full_codons=True, ter_symbol="*"):
         >>> translate_cds("AUGCG", full_codons=False)
         'M*'
 
+        >>> translate_cds("ATGTAN")
+        'MX'
+
+        >>> translate_cds("CCN")
+        'X'
+
+        >>> translate_cds("TRA")
+        'X'
+
+        >>> translate_cds("TRATA", full_codons=False)
+        'X*'
+
         >>> translate_cds("AUGCGQ")
         Traceback (most recent call last):
         ...
@@ -441,7 +453,12 @@ def translate_cds(seq, full_codons=True, ter_symbol="*"):
     protein_seq = list()
     for i in range(0, len(seq) - len(seq) % 3, 3):
         try:
-            aa = dna_to_aa1_lut[seq[i:i + 3]]
+            codon = seq[i:i + 3]
+            iupac_ambiguity_codes = "BDHVNUWSMKRYZ"
+            if any([iupac_ambiguity_code in codon for iupac_ambiguity_code in iupac_ambiguity_codes]):
+                aa = "X"
+            else:
+                aa = dna_to_aa1_lut[codon]
         except KeyError:
             raise ValueError("Codon {} at position {}..{} is undefined in codon table".format(
                 seq[i:i + 3], i+1, i+3))

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -4,7 +4,6 @@
 import pytest
 import vcr
 
-
 # set vcr logging level
 logging.basicConfig()
 logger = logging.getLogger("vcr")
@@ -19,5 +18,5 @@
     filter_headers=["Authorization"],
     filter_post_data_parameters=["Authorization"],
     record_mode=os.environ.get("VCR_RECORD_MODE", "once"),
-    )
+)
 vcr.use_cassette = vcr.default_vcr.use_cassette
diff --git a/tests/test_normalize.py b/tests/test_normalize.py
@@ -4,10 +4,8 @@
 
 import pytest
 
-
 sequence = "CCCCCCCCACACACACACTAGCAGCAGCA"
 
-
 normalize_seq = partial(normalize, sequence=sequence)
 normalize_trim = partial(normalize_seq, mode=NormalizationMode.TRIMONLY)
 normalize_left = partial(normalize_seq, mode=NormalizationMode.LEFTSHUFFLE)
@@ -17,26 +15,26 @@
 
 
 def test_trim():
-    assert ((25, 25), ('', 'AC')) == normalize_trim(interval=(22,25), alleles=(None, "AGCAC"))
-    assert ((24, 25), ('C', '', 'CAC')) == normalize_trim(interval=(22,25), alleles=(None, "AG", "AGCAC"))
-    assert ((23, 24), ('G', '', 'GCA')) == normalize_trim(interval=(22,25), alleles=(None, "AC", "AGCAC"))
-    assert ((22, 24), ('AG', 'G', 'AGCA')) == normalize_trim(interval=(22,25), alleles=(None, "GC", "AGCAC"))
+    assert ((25, 25), ('', 'AC')) == normalize_trim(interval=(22, 25), alleles=(None, "AGCAC"))
+    assert ((24, 25), ('C', '', 'CAC')) == normalize_trim(interval=(22, 25), alleles=(None, "AG", "AGCAC"))
+    assert ((23, 24), ('G', '', 'GCA')) == normalize_trim(interval=(22, 25), alleles=(None, "AC", "AGCAC"))
+    assert ((22, 24), ('AG', 'G', 'AGCA')) == normalize_trim(interval=(22, 25), alleles=(None, "GC", "AGCAC"))
 
 
 def test_anchor():
-    assert ((23, 25), ('GC', '')) == normalize_trim(interval=(22,25), alleles=(None, "A"), anchor_length=0)
-    assert ((22, 26), ('AGCA', 'AA')) == normalize_trim(interval=(22,25), alleles=(None, "A"), anchor_length=1)
-    assert ((21, 27), ('CAGCAG', 'CAAG')) == normalize_trim(interval=(22,25), alleles=(None, "A"), anchor_length=2)
+    assert ((23, 25), ('GC', '')) == normalize_trim(interval=(22, 25), alleles=(None, "A"), anchor_length=0)
+    assert ((22, 26), ('AGCA', 'AA')) == normalize_trim(interval=(22, 25), alleles=(None, "A"), anchor_length=1)
+    assert ((21, 27), ('CAGCAG', 'CAAG')) == normalize_trim(interval=(22, 25), alleles=(None, "A"), anchor_length=2)
 
     # off the left
-    assert ((1, 1), ('', 'C')) == normalize_trim(interval=(1,1), alleles=(None, "C"), anchor_length=0)
-    assert ((0, 2), ('CC', 'CCC')) == normalize_trim(interval=(1,1), alleles=(None, "C"), anchor_length=1)
-    assert ((0, 3), ('CCC', 'CCCC')) == normalize_trim(interval=(1,1), alleles=(None, "C"), anchor_length=2)
+    assert ((1, 1), ('', 'C')) == normalize_trim(interval=(1, 1), alleles=(None, "C"), anchor_length=0)
+    assert ((0, 2), ('CC', 'CCC')) == normalize_trim(interval=(1, 1), alleles=(None, "C"), anchor_length=1)
+    assert ((0, 3), ('CCC', 'CCCC')) == normalize_trim(interval=(1, 1), alleles=(None, "C"), anchor_length=2)
 
     # off the right
-    assert ((28, 28), ('', 'C')) == normalize_trim(interval=(28,28), alleles=(None, "C"), anchor_length=0)
-    assert ((27, 29), ('CA', 'CCA')) == normalize_trim(interval=(28,28), alleles=(None, "C"), anchor_length=1)
-    assert ((26, 29), ('GCA', 'GCCA')) == normalize_trim(interval=(28,28), alleles=(None, "C"), anchor_length=2)
+    assert ((28, 28), ('', 'C')) == normalize_trim(interval=(28, 28), alleles=(None, "C"), anchor_length=0)
+    assert ((27, 29), ('CA', 'CCA')) == normalize_trim(interval=(28, 28), alleles=(None, "C"), anchor_length=1)
+    assert ((26, 29), ('GCA', 'GCCA')) == normalize_trim(interval=(28, 28), alleles=(None, "C"), anchor_length=2)
 
 
 def test_trinuc():
@@ -54,15 +52,17 @@ def test_trinuc():
 
 def test_bounds():
     """ensure that bounds are honored"""
-    assert ((20, 24), ('GCAG', 'GCAGCAG')) == normalize_expand(interval=(22, 22), alleles=(None, 'AGC'), bounds=(20,24))
+    assert ((20, 24), ('GCAG', 'GCAGCAG')) == normalize_expand(interval=(22, 22),
+                                                               alleles=(None, 'AGC'),
+                                                               bounds=(20, 24))
 
 
 #TODO: def test_multiallele():
 
 
 def test_mode_string():
     "test that mode as string is accepted"
-    _normalize = partial(normalize_seq, interval=(28,28), alleles=(None, "C"))
+    _normalize = partial(normalize_seq, interval=(28, 28), alleles=(None, "C"))
     vcf_out = ((26, 27), ('G', 'GC'))
     assert vcf_out != _normalize(), "not VCF output by default"
     assert vcf_out == _normalize(mode="VCF"), "mode as string recognized"
@@ -71,17 +71,17 @@ def test_mode_string():
 def test_input_alleles_not_modified():
     """ensure that alleles list is not modified"""
     alleles = (None, "AGCAC")
-    normalize_trim(interval=(22,25), alleles=alleles)
+    normalize_trim(interval=(22, 25), alleles=alleles)
     assert (None, "AGCAC") == alleles
 
 
 def test_error_distinct():
     """Must have at least two distinct allele sequences (incl. ref) to normalize"""
     with pytest.raises(ValueError):
-        normalize_trim(interval=(22,25), alleles=(None, "AGC"))
+        normalize_trim(interval=(22, 25), alleles=(None, "AGC"))
 
 
 def test_error_ref_allele():
     "First allele is ref allele and must be None"
     with pytest.raises(ValueError):
-        normalize_trim(interval=(22,25), alleles=("foo", "AGC"))
+        normalize_trim(interval=(22, 25), alleles=("foo", "AGC"))
diff --git a/tests/test_seqfetcher.py b/tests/test_seqfetcher.py
@@ -11,8 +11,8 @@
 def test_fetch_seq():
     assert 1596 == len(fetch_seq('NP_056374.2'))
 
-    assert 'MESRETLSSS' == fetch_seq('NP_056374.2',0,10)
-    assert 'MESRETLSSS' == fetch_seq('NP_056374.2')[0:10]  # NOT RECOMMENDED
+    assert 'MESRETLSSS' == fetch_seq('NP_056374.2', 0, 10)
+    assert 'MESRETLSSS' == fetch_seq('NP_056374.2')[0:10]    # NOT RECOMMENDED
 
     assert 'ATCACACGTGCAGGAACCCTTTTCC' == fetch_seq('NC_000001.10', 2000000, 2000025)
     assert 'AAAATTAAATTAAAATAAATAAAAA' == fetch_seq('NG_032072.1', 0, 25)
@@ -58,8 +58,9 @@ def test_fetch_seq_errors():
 
 def _check1(_x):
     # small, fast query
-    assert 'MESRETLSSS' == fetch_seq('NP_056374.2',0,10)
-
+    assert 'MESRETLSSS' == fetch_seq('NP_056374.2', 0, 10)
+
+
 # no vcr!
 @pytest.mark.network
 def test_rate_limit():