Skip to content

Commit

Permalink
Warn on writing INSDC-noncompliant feature and qualifier keys (biopython#4703)
Browse files Browse the repository at this point in the history

See https://www.insdc.org/submitting-standards/feature-table/#3.1 for
what can and cannot be in an INSDC feature or qualifier key.
  • Loading branch information
michaelfm1211 authored Apr 15, 2024
1 parent 0bf3031 commit 5f25209
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 2 deletions.
33 changes: 32 additions & 1 deletion Bio/SeqIO/InsdcIO.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import warnings

from datetime import datetime
from string import ascii_letters, digits

from Bio import BiopythonWarning
from Bio import SeqFeature
Expand All @@ -46,6 +47,9 @@
from .Interfaces import SequenceIterator
from .Interfaces import SequenceWriter

# Characters permitted in the names of INSDC feature table components, i.e.
# both feature keys and qualifier keys: ASCII letters, digits, underscore,
# hyphen, apostrophe and asterisk. See
# https://www.insdc.org/submitting-standards/feature-table/#3.1
_allowed_table_component_name_chars = {*ascii_letters, *digits, *"_-'*"}

# NOTE
# ====
Expand Down Expand Up @@ -376,6 +380,19 @@ class _InsdcWriter(SequenceWriter):
)

def _write_feature_qualifier(self, key, value=None, quote=None):
if not _allowed_table_component_name_chars.issuperset(key):
warnings.warn(
f"Feature qualifier key '{key}' contains characters not"
" allowed by standard.",
BiopythonWarning,
)
if len(key) > 20:
warnings.warn(
f"Feature qualifier key '{key}' is longer than maximum length"
" specified by standard (20 characters).",
BiopythonWarning,
)

if value is None:
# Value-less entry like /pseudo
self.handle.write(f"{self.QUALIFIER_INDENT_STR}/{key}\n")
Expand Down Expand Up @@ -439,8 +456,22 @@ def _wrap_location(self, location):
def _write_feature(self, feature, record_length):
"""Write a single SeqFeature object to features table (PRIVATE)."""
assert feature.type, feature
location = _insdc_location_string(feature.location, record_length)

f_type = feature.type.replace(" ", "_")
if not _allowed_table_component_name_chars.issuperset(f_type):
warnings.warn(
f"Feature key '{f_type}' contains characters not allowed by"
" standard.",
BiopythonWarning,
)
if len(f_type) > 15:
warnings.warn(
f"Feature key '{f_type}' is longer than maximum length"
" specified by standard (15 characters).",
BiopythonWarning,
)

location = _insdc_location_string(feature.location, record_length)
line = (
(self.QUALIFIER_INDENT_TMP % f_type)[: self.QUALIFIER_INDENT]
+ self._wrap_location(location)
Expand Down
24 changes: 23 additions & 1 deletion Tests/test_SeqIO_Insdc.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from io import StringIO

from Bio import BiopythonParserWarning
from Bio import BiopythonWarning, BiopythonParserWarning
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqFeature import SimpleLocation
Expand Down Expand Up @@ -75,6 +75,28 @@ def test_writing_empty_qualifiers(self):
self.assertIn(" /one=1\n", gbk)
self.assertIn(' /text="blah"\n', gbk)

def test_warn_on_writing_nonstandard_feature_key(self):
    """Check writing GenBank warns about INSDC-noncompliant feature keys."""
    record = SeqRecord(Seq("A" * 100), "dummy")
    record.annotations["molecule_type"] = "DNA"
    # One feature key per rule enforced by the writer: longer than the
    # 15 character limit, and containing a character outside the allowed
    # set. Matching on the message text ensures each rule raises its own
    # warning, rather than accepting any BiopythonWarning.
    cases = (
        ("a" * 16, r"Feature key '.*' is longer than maximum length"),
        ("bad!key", r"Feature key '.*' contains characters not allowed"),
    )
    for feature_type, message in cases:
        with self.subTest(feature_type=feature_type):
            record.features = [
                SeqFeature(SimpleLocation(5, 20, strand=+1), type=feature_type)
            ]
            with self.assertWarnsRegex(BiopythonWarning, message):
                record.format("gb")

def test_warn_on_writing_nonstandard_qualifier_key(self):
    """Check writing GenBank warns about INSDC-noncompliant qualifier keys."""
    record = SeqRecord(Seq("A" * 100), "dummy")
    record.annotations["molecule_type"] = "DNA"
    # One qualifier key per rule enforced by the writer: longer than the
    # 20 character limit, and containing a character outside the allowed
    # set. Matching on the message text ensures each rule raises its own
    # warning, rather than accepting any BiopythonWarning.
    cases = (
        ("a" * 21, r"Feature qualifier key '.*' is longer than maximum length"),
        ("bad!key", r"Feature qualifier key '.*' contains characters not allowed"),
    )
    for qualifier_key, message in cases:
        with self.subTest(qualifier_key=qualifier_key):
            record.features = [
                SeqFeature(
                    SimpleLocation(5, 20, strand=+1),
                    type="region",
                    qualifiers={qualifier_key: "test"},
                )
            ]
            with self.assertWarnsRegex(BiopythonWarning, message):
                record.format("gb")


class TestEmblRewrite(SeqRecordTestBaseClass):
def check_rewrite(self, filename):
Expand Down

0 comments on commit 5f25209

Please sign in to comment.