Skip to content

Commit

Permalink
Closes #29: Support ambiguity codes in translation
Browse files Browse the repository at this point in the history
This PR adds basic support for ambiguity codes. Issue #31 proposes improved support at a later date.
  • Loading branch information
reece authored Apr 14, 2021
2 parents 3a03aa0 + 5d7484b commit c5ea54e
Show file tree
Hide file tree
Showing 5 changed files with 45 additions and 28 deletions.
2 changes: 1 addition & 1 deletion src/bioutils/cytobands.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def get_cytoband_names():
list of str: The names of the available cytobands.
Examples:
>>> get_cytoband_names()
>>> sorted(get_cytoband_names())
['ucsc-hg19', 'ucsc-hg38']
"""

Expand Down
19 changes: 18 additions & 1 deletion src/bioutils/sequences.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,6 +420,18 @@ def translate_cds(seq, full_codons=True, ter_symbol="*"):
>>> translate_cds("AUGCG", full_codons=False)
'M*'
>>> translate_cds("ATGTAN")
'MX'
>>> translate_cds("CCN")
'X'
>>> translate_cds("TRA")
'X'
>>> translate_cds("TRATA", full_codons=False)
'X*'
>>> translate_cds("AUGCGQ")
Traceback (most recent call last):
...
Expand All @@ -441,7 +453,12 @@ def translate_cds(seq, full_codons=True, ter_symbol="*"):
protein_seq = list()
for i in range(0, len(seq) - len(seq) % 3, 3):
try:
aa = dna_to_aa1_lut[seq[i:i + 3]]
codon = seq[i:i + 3]
iupac_ambiguity_codes = "BDHVNUWSMKRYZ"
if any([iupac_ambiguity_code in codon for iupac_ambiguity_code in iupac_ambiguity_codes]):
aa = "X"
else:
aa = dna_to_aa1_lut[codon]
except KeyError:
raise ValueError("Codon {} at position {}..{} is undefined in codon table".format(
seq[i:i + 3], i+1, i+3))
Expand Down
3 changes: 1 addition & 2 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import pytest
import vcr


# set vcr logging level
logging.basicConfig()
logger = logging.getLogger("vcr")
Expand All @@ -19,5 +18,5 @@
filter_headers=["Authorization"],
filter_post_data_parameters=["Authorization"],
record_mode=os.environ.get("VCR_RECORD_MODE", "once"),
)
)
vcr.use_cassette = vcr.default_vcr.use_cassette
40 changes: 20 additions & 20 deletions tests/test_normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,8 @@

import pytest


sequence = "CCCCCCCCACACACACACTAGCAGCAGCA"


normalize_seq = partial(normalize, sequence=sequence)
normalize_trim = partial(normalize_seq, mode=NormalizationMode.TRIMONLY)
normalize_left = partial(normalize_seq, mode=NormalizationMode.LEFTSHUFFLE)
Expand All @@ -17,26 +15,26 @@


def test_trim():
assert ((25, 25), ('', 'AC')) == normalize_trim(interval=(22,25), alleles=(None, "AGCAC"))
assert ((24, 25), ('C', '', 'CAC')) == normalize_trim(interval=(22,25), alleles=(None, "AG", "AGCAC"))
assert ((23, 24), ('G', '', 'GCA')) == normalize_trim(interval=(22,25), alleles=(None, "AC", "AGCAC"))
assert ((22, 24), ('AG', 'G', 'AGCA')) == normalize_trim(interval=(22,25), alleles=(None, "GC", "AGCAC"))
assert ((25, 25), ('', 'AC')) == normalize_trim(interval=(22, 25), alleles=(None, "AGCAC"))
assert ((24, 25), ('C', '', 'CAC')) == normalize_trim(interval=(22, 25), alleles=(None, "AG", "AGCAC"))
assert ((23, 24), ('G', '', 'GCA')) == normalize_trim(interval=(22, 25), alleles=(None, "AC", "AGCAC"))
assert ((22, 24), ('AG', 'G', 'AGCA')) == normalize_trim(interval=(22, 25), alleles=(None, "GC", "AGCAC"))


def test_anchor():
assert ((23, 25), ('GC', '')) == normalize_trim(interval=(22,25), alleles=(None, "A"), anchor_length=0)
assert ((22, 26), ('AGCA', 'AA')) == normalize_trim(interval=(22,25), alleles=(None, "A"), anchor_length=1)
assert ((21, 27), ('CAGCAG', 'CAAG')) == normalize_trim(interval=(22,25), alleles=(None, "A"), anchor_length=2)
assert ((23, 25), ('GC', '')) == normalize_trim(interval=(22, 25), alleles=(None, "A"), anchor_length=0)
assert ((22, 26), ('AGCA', 'AA')) == normalize_trim(interval=(22, 25), alleles=(None, "A"), anchor_length=1)
assert ((21, 27), ('CAGCAG', 'CAAG')) == normalize_trim(interval=(22, 25), alleles=(None, "A"), anchor_length=2)

# off the left
assert ((1, 1), ('', 'C')) == normalize_trim(interval=(1,1), alleles=(None, "C"), anchor_length=0)
assert ((0, 2), ('CC', 'CCC')) == normalize_trim(interval=(1,1), alleles=(None, "C"), anchor_length=1)
assert ((0, 3), ('CCC', 'CCCC')) == normalize_trim(interval=(1,1), alleles=(None, "C"), anchor_length=2)
assert ((1, 1), ('', 'C')) == normalize_trim(interval=(1, 1), alleles=(None, "C"), anchor_length=0)
assert ((0, 2), ('CC', 'CCC')) == normalize_trim(interval=(1, 1), alleles=(None, "C"), anchor_length=1)
assert ((0, 3), ('CCC', 'CCCC')) == normalize_trim(interval=(1, 1), alleles=(None, "C"), anchor_length=2)

# off the right
assert ((28, 28), ('', 'C')) == normalize_trim(interval=(28,28), alleles=(None, "C"), anchor_length=0)
assert ((27, 29), ('CA', 'CCA')) == normalize_trim(interval=(28,28), alleles=(None, "C"), anchor_length=1)
assert ((26, 29), ('GCA', 'GCCA')) == normalize_trim(interval=(28,28), alleles=(None, "C"), anchor_length=2)
assert ((28, 28), ('', 'C')) == normalize_trim(interval=(28, 28), alleles=(None, "C"), anchor_length=0)
assert ((27, 29), ('CA', 'CCA')) == normalize_trim(interval=(28, 28), alleles=(None, "C"), anchor_length=1)
assert ((26, 29), ('GCA', 'GCCA')) == normalize_trim(interval=(28, 28), alleles=(None, "C"), anchor_length=2)


def test_trinuc():
Expand All @@ -54,15 +52,17 @@ def test_trinuc():

def test_bounds():
"""ensure that bounds are honored"""
assert ((20, 24), ('GCAG', 'GCAGCAG')) == normalize_expand(interval=(22, 22), alleles=(None, 'AGC'), bounds=(20,24))
assert ((20, 24), ('GCAG', 'GCAGCAG')) == normalize_expand(interval=(22, 22),
alleles=(None, 'AGC'),
bounds=(20, 24))


#TODO: def test_multiallele():


def test_mode_string():
"test that mode as string is accepted"
_normalize = partial(normalize_seq, interval=(28,28), alleles=(None, "C"))
_normalize = partial(normalize_seq, interval=(28, 28), alleles=(None, "C"))
vcf_out = ((26, 27), ('G', 'GC'))
assert vcf_out != _normalize(), "not VCF output by default"
assert vcf_out == _normalize(mode="VCF"), "mode as string recognized"
Expand All @@ -71,17 +71,17 @@ def test_mode_string():
def test_input_alleles_not_modified():
"""ensure that alleles list is not modified"""
alleles = (None, "AGCAC")
normalize_trim(interval=(22,25), alleles=alleles)
normalize_trim(interval=(22, 25), alleles=alleles)
assert (None, "AGCAC") == alleles


def test_error_distinct():
"""Must have at least two distinct allele sequences (incl. ref) to normalize"""
with pytest.raises(ValueError):
normalize_trim(interval=(22,25), alleles=(None, "AGC"))
normalize_trim(interval=(22, 25), alleles=(None, "AGC"))


def test_error_ref_allele():
"First allele is ref allele and must be None"
with pytest.raises(ValueError):
normalize_trim(interval=(22,25), alleles=("foo", "AGC"))
normalize_trim(interval=(22, 25), alleles=("foo", "AGC"))
9 changes: 5 additions & 4 deletions tests/test_seqfetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
def test_fetch_seq():
assert 1596 == len(fetch_seq('NP_056374.2'))

assert 'MESRETLSSS' == fetch_seq('NP_056374.2',0,10)
assert 'MESRETLSSS' == fetch_seq('NP_056374.2')[0:10] # NOT RECOMMENDED
assert 'MESRETLSSS' == fetch_seq('NP_056374.2', 0, 10)
assert 'MESRETLSSS' == fetch_seq('NP_056374.2')[0:10] # NOT RECOMMENDED

assert 'ATCACACGTGCAGGAACCCTTTTCC' == fetch_seq('NC_000001.10', 2000000, 2000025)
assert 'AAAATTAAATTAAAATAAATAAAAA' == fetch_seq('NG_032072.1', 0, 25)
Expand Down Expand Up @@ -58,8 +58,9 @@ def test_fetch_seq_errors():

def _check1(_x):
# small, fast query
assert 'MESRETLSSS' == fetch_seq('NP_056374.2',0,10)

assert 'MESRETLSSS' == fetch_seq('NP_056374.2', 0, 10)


# no vcr!
@pytest.mark.network
def test_rate_limit():
Expand Down

0 comments on commit c5ea54e

Please sign in to comment.