From ea8ff473e33f543a59d0ba66d8d01105c5a91a04 Mon Sep 17 00:00:00 2001 From: mdehoon Date: Wed, 18 Sep 2024 18:04:06 +0900 Subject: [PATCH] Allow parsers in Bio.SeqIO to handle both text and binary modes (#4842) --- Bio/SeqIO/AbiIO.py | 4 +++- Bio/SeqIO/AceIO.py | 6 ++++-- Bio/SeqIO/FastaIO.py | 16 ++++++++++++---- Bio/SeqIO/GckIO.py | 4 +++- Bio/SeqIO/GfaIO.py | 8 ++++++-- Bio/SeqIO/IgIO.py | 4 +++- Bio/SeqIO/InsdcIO.py | 20 ++++++++++++++----- Bio/SeqIO/Interfaces.py | 36 ++++++++++++++++++++++++++--------- Bio/SeqIO/NibIO.py | 4 +++- Bio/SeqIO/PdbIO.py | 10 +++++++++- Bio/SeqIO/PhdIO.py | 4 +++- Bio/SeqIO/PirIO.py | 4 +++- Bio/SeqIO/QualityIO.py | 8 ++++++-- Bio/SeqIO/SeqXmlIO.py | 4 +++- Bio/SeqIO/SffIO.py | 4 +++- Bio/SeqIO/SnapGeneIO.py | 4 +++- Bio/SeqIO/SwissIO.py | 4 +++- Bio/SeqIO/TabIO.py | 4 +++- Bio/SeqIO/TwoBitIO.py | 4 +++- Bio/SeqIO/UniprotIO.py | 17 ++++++++++++++++- Bio/SeqIO/XdnaIO.py | 4 +++- DEPRECATED.rst | 8 ++++++++ Tests/test_SeqIO_UniprotIO.py | 12 ++++++++++-- 23 files changed, 152 insertions(+), 41 deletions(-) diff --git a/Bio/SeqIO/AbiIO.py b/Bio/SeqIO/AbiIO.py index 3c32f8b2ced..ad412efab03 100644 --- a/Bio/SeqIO/AbiIO.py +++ b/Bio/SeqIO/AbiIO.py @@ -347,9 +347,11 @@ def _get_string_tag(opt_bytes_value, default=None): class AbiIterator(SequenceIterator): """Parser for Abi files.""" + modes = "b" + def __init__(self, source, trim=False): """Return an iterator for the Abi file format.""" - super().__init__(source, mode="b", fmt="ABI") + super().__init__(source, fmt="ABI") # check if input file is a valid Abi file marker = self.stream.read(4) if not marker: diff --git a/Bio/SeqIO/AceIO.py b/Bio/SeqIO/AceIO.py index 63fea2b3075..e9cd9bd3c1f 100644 --- a/Bio/SeqIO/AceIO.py +++ b/Bio/SeqIO/AceIO.py @@ -22,6 +22,8 @@ class AceIterator(SequenceIterator): """Return SeqRecord objects from an ACE file.""" + modes = "t" + def __init__( self, source: _TextIOSource, @@ -69,8 +71,8 @@ def __init__( 90 """ - super().__init__(source, mode="t", fmt="ACE") - self.ace_contigs = Ace._parse(self.stream) + super().__init__(source, fmt="ACE") + self.ace_contigs = Ace.parse(self.stream) def __next__(self): try: diff --git a/Bio/SeqIO/FastaIO.py b/Bio/SeqIO/FastaIO.py index a1489499a0f..aecd4e702f9 100644 --- a/Bio/SeqIO/FastaIO.py +++ b/Bio/SeqIO/FastaIO.py @@ -143,6 +143,8 @@ def FastaTwoLineParser(handle): class FastaIterator(SequenceIterator): """Parser for plain Fasta files without comments.""" + modes = "t" + def __init__( self, source: _TextIOSource, @@ -191,7 +193,7 @@ def __init__( """ if alphabet is not None: raise ValueError("The alphabet argument is no longer supported") - super().__init__(source, mode="t", fmt="Fasta") + super().__init__(source, fmt="Fasta") try: line = next(self.stream) except StopIteration: @@ -266,6 +268,8 @@ def __next__(self): class FastaTwoLineIterator(SequenceIterator): """Parser for Fasta files with exactly two lines per record.""" + modes = "t" + def __init__(self, source): """Iterate over two-line Fasta records (as SeqRecord objects). @@ -278,7 +282,7 @@ def __init__(self, source): Only the default title to ID/name/description parsing offered by the relaxed FASTA parser is offered. """ - super().__init__(source, mode="t", fmt="FASTA") + super().__init__(source, fmt="FASTA") self._data = FastaTwoLineParser(self.stream) def __next__(self): @@ -300,6 +304,8 @@ def __next__(self): class FastaBlastIterator(SequenceIterator): """Parser for Fasta files, allowing for comments as in BLAST.""" + modes = "t" + def __init__( self, source: _TextIOSource, @@ -348,7 +354,7 @@ def __init__( """ if alphabet is not None: raise ValueError("The alphabet argument is no longer supported") - super().__init__(source, mode="t", fmt="FASTA") + super().__init__(source, fmt="FASTA") for line in self.stream: if line[0] not in "#!;": if not line.startswith(">"): @@ -397,6 +403,8 @@ def __next__(self): class FastaPearsonIterator(SequenceIterator): """Parser for Fasta files, allowing for comments as in the FASTA aligner.""" + modes = "t" + def __init__( self, source: _TextIOSource, @@ -446,7 +454,7 @@ def __init__( """ if alphabet is not None: raise ValueError("The alphabet argument is no longer supported") - super().__init__(source, mode="t", fmt="Fasta") + super().__init__(source, fmt="Fasta") for line in self.stream: if line.startswith(">"): self._line = line diff --git a/Bio/SeqIO/GckIO.py b/Bio/SeqIO/GckIO.py index f2a550c05d5..118a1c03412 100644 --- a/Bio/SeqIO/GckIO.py +++ b/Bio/SeqIO/GckIO.py @@ -73,13 +73,15 @@ def _read_p4string(stream): class GckIterator(SequenceIterator): """Parser for GCK files.""" + modes = "b" + def __init__(self, source): """Break up a GCK file into SeqRecord objects. Note that a GCK file can only contain one sequence, so this iterator will always return a single record. """ - super().__init__(source, mode="b", fmt="GCK") + super().__init__(source, fmt="GCK") # Skip file header # GCK files start with a 24-bytes header. Bytes 4 and 8 seem to # always be 12, maybe this could act as a magic cookie. Bytes diff --git a/Bio/SeqIO/GfaIO.py b/Bio/SeqIO/GfaIO.py index e50a3eca36b..6098a8826bc 100644 --- a/Bio/SeqIO/GfaIO.py +++ b/Bio/SeqIO/GfaIO.py @@ -119,6 +119,8 @@ class Gfa1Iterator(SequenceIterator): Documentation: https://gfa-spec.github.io/GFA-spec/GFA1.html """ + modes = "t" + def __init__( self, source: _TextIOSource, @@ -128,7 +130,7 @@ def __init__( Arguments: - source - input stream opened in text mode, or a path to a file """ - super().__init__(source, mode="t", fmt="GFA 1.0") + super().__init__(source, fmt="GFA 1.0") def __next__(self): for line in self.stream: @@ -164,6 +166,8 @@ class Gfa2Iterator(SequenceIterator): Documentation for version 2: https://gfa-spec.github.io/GFA-spec/GFA2.html """ + modes = "t" + def __init__( self, source: _TextIOSource, @@ -173,7 +177,7 @@ def __init__( Arguments: - source - input stream opened in text mode, or a path to a file """ - super().__init__(source, mode="t", fmt="GFA 2.0") + super().__init__(source, fmt="GFA 2.0") def __next__(self): for line in self.stream: diff --git a/Bio/SeqIO/IgIO.py b/Bio/SeqIO/IgIO.py index a12ad4a77d8..91377565b5d 100644 --- a/Bio/SeqIO/IgIO.py +++ b/Bio/SeqIO/IgIO.py @@ -22,6 +22,8 @@ class IgIterator(SequenceIterator): """Parser for IntelliGenetics files.""" + modes = "t" + def __init__(self, source): """Iterate over IntelliGenetics records (as SeqRecord objects). @@ -60,7 +62,7 @@ def __init__(self, source): SYK_SYK length 330 """ - super().__init__(source, mode="t", fmt="IntelliGenetics") + super().__init__(source, fmt="IntelliGenetics") for line in self.stream: if not line.startswith(";;"): break diff --git a/Bio/SeqIO/InsdcIO.py b/Bio/SeqIO/InsdcIO.py index 4d05bb33402..b892dbbf142 100644 --- a/Bio/SeqIO/InsdcIO.py +++ b/Bio/SeqIO/InsdcIO.py @@ -62,6 +62,8 @@ class GenBankIterator(SequenceIterator): """Parser for GenBank files.""" + modes = "t" + def __init__(self, source): """Break up a Genbank file into SeqRecord objects. @@ -99,7 +101,7 @@ def __init__(self, source): AF297471.1 """ - super().__init__(source, mode="t", fmt="GenBank") + super().__init__(source, fmt="GenBank") self.records = GenBankScanner(debug=0).parse_records(self.stream) def __next__(self): @@ -115,6 +117,8 @@ def __next__(self): class EmblIterator(SequenceIterator): """Parser for EMBL files.""" + modes = "t" + def __init__(self, source): """Break up an EMBL file into SeqRecord objects. @@ -158,7 +162,7 @@ def __init__(self, source): CQ797900.1 """ - super().__init__(source, mode="t", fmt="EMBL") + super().__init__(source, fmt="EMBL") self.records = EmblScanner(debug=0).parse_records(self.stream) def __next__(self): @@ -174,6 +178,8 @@ def __next__(self): class ImgtIterator(SequenceIterator): """Parser for IMGT files.""" + modes = "t" + def __init__(self, source): """Break up an IMGT file into SeqRecord objects. @@ -184,7 +190,7 @@ def __init__(self, source): Note that for genomes or chromosomes, there is typically only one record. """ - super().__init__(source, mode="t", fmt="IMGT") + super().__init__(source, fmt="IMGT") self.records = _ImgtScanner(debug=0).parse_records(self.stream) def __next__(self): @@ -200,6 +206,8 @@ def __next__(self): class GenBankCdsFeatureIterator(SequenceIterator): """Parser for GenBank files, creating a SeqRecord for each CDS feature.""" + modes = "t" + def __init__(self, source): """Break up a Genbank file into SeqRecord objects for each CDS feature. @@ -209,7 +217,7 @@ def __init__(self, source): many CDS features. These are returned as with the stated amino acid translation sequence (if given). """ - super().__init__(source, mode="t", fmt="GenBank") + super().__init__(source, fmt="GenBank") self.records = GenBankScanner(debug=0).parse_cds_features(self.stream) def __next__(self): @@ -225,6 +233,8 @@ def __next__(self): class EmblCdsFeatureIterator(SequenceIterator): """Parser for EMBL files, creating a SeqRecord for each CDS feature.""" + modes = "t" + def __init__(self, source): """Break up a EMBL file into SeqRecord objects for each CDS feature. @@ -234,7 +244,7 @@ def __init__(self, source): many CDS features. These are returned as with the stated amino acid translation sequence (if given). """ - super().__init__(source, mode="t", fmt="EMBL") + super().__init__(source, fmt="EMBL") self.records = EmblScanner(debug=0).parse_cds_features(self.stream) def __next__(self): diff --git a/Bio/SeqIO/Interfaces.py b/Bio/SeqIO/Interfaces.py index 780a7af992b..8af8438b4a4 100644 --- a/Bio/SeqIO/Interfaces.py +++ b/Bio/SeqIO/Interfaces.py @@ -12,6 +12,7 @@ from abc import ABC from abc import abstractmethod +from abc import abstractproperty from os import PathLike from typing import AnyStr from typing import Generic @@ -37,13 +38,25 @@ class SequenceIterator(ABC, Generic[AnyStr]): You should write a __next__ method that returns the next SeqRecord. You may wish to redefine the __init__ method as well. + You must also create a class property `modes` specifying the allowable + file stream modes. """ + @abstractproperty + def modes(self): + """File modes (binary or text) that the parser can handle. + + This property must be "t" (for text mode only), "b" (for binary mode + only), "tb" (if both text and binary mode are accepted, but text mode + is preferred), or "bt" (if both text and binary mode are accepted, but + binary mode is preferred). + """ + pass + def __init__( self, source: _IOSource, alphabet: None = None, - mode: str = "t", fmt: Optional[str] = None, ) -> None: """Create a SequenceIterator object. @@ -51,7 +64,6 @@ def __init__( Arguments: - source - input file stream, or path to input file - alphabet - no longer used, should be None - - mode - string, either "t" for text mode or "b" for binary - fmt - string, mixed case format name for in error messages This method MAY be overridden by any subclass. @@ -63,24 +75,30 @@ def __init__( """ if alphabet is not None: raise ValueError("The alphabet argument is no longer supported") + modes = self.modes if isinstance(source, _PathLikeTypes): + mode = modes[0] self.stream = open(source, "r" + mode) self.should_close_stream = True else: - if mode == "t": - if source.read(0) != "": + value = source.read(0) + if value == "": + if modes == "b": raise StreamModeError( - f"{fmt} files must be opened in text mode." + f"{fmt} files must be opened in binary mode." ) from None - elif mode == "b": - if source.read(0) != b"": + mode = "t" + elif value == b"": + if modes == "t": raise StreamModeError( - f"{fmt} files must be opened in binary mode." + f"{fmt} files must be opened in text mode." ) from None + mode = "b" else: - raise ValueError(f"Unknown mode '{mode}'") from None + raise RuntimeError("Failed to read from input data") from None self.stream = source self.should_close_stream = False + self.mode = mode @abstractmethod def __next__(self): diff --git a/Bio/SeqIO/NibIO.py b/Bio/SeqIO/NibIO.py index 9569bef30dc..c5b81c4f6ba 100644 --- a/Bio/SeqIO/NibIO.py +++ b/Bio/SeqIO/NibIO.py @@ -53,6 +53,8 @@ class NibIterator(SequenceIterator): """Parser for nib files.""" + modes = "b" + def __init__(self, source): """Iterate over a nib file and yield a SeqRecord. @@ -79,7 +81,7 @@ def __init__(self, source): nAGAAGagccgcNGgCActtGAnTAtCGTCgcCacCaGncGncTtGNtGG 50 """ - super().__init__(source, mode="b", fmt="Nib") + super().__init__(source, fmt="Nib") word = self.stream.read(4) if not word: raise ValueError("Empty file.") diff --git a/Bio/SeqIO/PdbIO.py b/Bio/SeqIO/PdbIO.py index 1d81e44b4b0..40bafea0184 100644 --- a/Bio/SeqIO/PdbIO.py +++ b/Bio/SeqIO/PdbIO.py @@ -114,6 +114,8 @@ def AtomIterator(pdb_id, structure): class PdbSeqresIterator(SequenceIterator): """Parser for PDB files.""" + modes = "t" + def __init__(self, source: _TextIOSource) -> None: """Iterate over chains in a PDB file as SeqRecord objects. @@ -151,7 +153,7 @@ def __init__(self, source: _TextIOSource) -> None: Note the chain is recorded in the annotations dictionary, and any PDB DBREF lines are recorded in the database cross-references list. """ - super().__init__(source, mode="t", fmt="PDB") + super().__init__(source, fmt="PDB") self.cache = None def __next__(self): @@ -276,6 +278,8 @@ def __next__(self): class PdbAtomIterator(SequenceIterator): """Parser for structures in a PDB files.""" + modes = "t" + def __init__(self, source: _TextIOSource) -> None: """Iterate over structures in a PDB file as SeqRecord objects. @@ -373,6 +377,8 @@ def __next__(self): class CifSeqresIterator(SequenceIterator): """Parser for chains in an mmCIF files.""" + modes = "t" + def __init__(self, source: _TextIOSource) -> None: """Iterate over chains in an mmCIF file as SeqRecord objects. @@ -498,6 +504,8 @@ def __next__(self): class CifAtomIterator(SequenceIterator): """Parser for structures in an mmCIF files.""" + modes = "t" + def __init__(self, source: _TextIOSource) -> None: """Iterate over structures in an mmCIF file as SeqRecord objects. diff --git a/Bio/SeqIO/PhdIO.py b/Bio/SeqIO/PhdIO.py index 42ee6771443..8c3b07e342a 100644 --- a/Bio/SeqIO/PhdIO.py +++ b/Bio/SeqIO/PhdIO.py @@ -67,6 +67,8 @@ class PhdIterator(SequenceIterator): """Parser for PHD files.""" + modes = "t" + def __init__(self, source: _TextIOSource) -> None: """Return SeqRecord objects from a PHD file. @@ -75,7 +77,7 @@ def __init__(self, source: _TextIOSource) -> None: This uses the Bio.Sequencing.Phd module to do the hard work. """ - super().__init__(source, mode="t", fmt="PHD") + super().__init__(source, fmt="PHD") def __next__(self): phd_record = Phd._read(self.stream) diff --git a/Bio/SeqIO/PirIO.py b/Bio/SeqIO/PirIO.py index 17d5b9a7a2e..51bce08ef6d 100644 --- a/Bio/SeqIO/PirIO.py +++ b/Bio/SeqIO/PirIO.py @@ -110,6 +110,8 @@ class PirIterator(SequenceIterator): """Parser for PIR files.""" + modes = "t" + def __init__(self, source): """Iterate over a PIR file and yield SeqRecord objects. @@ -128,7 +130,7 @@ def __init__(self, source): HLA:HLA01083 length 188 """ - super().__init__(source, mode="t", fmt="Pir") + super().__init__(source, fmt="Pir") # Skip any text before the first record (e.g. blank lines, comments) for line in self.stream: if line[0] == ">": diff --git a/Bio/SeqIO/QualityIO.py b/Bio/SeqIO/QualityIO.py index c1d0511eaf5..33598606115 100644 --- a/Bio/SeqIO/QualityIO.py +++ b/Bio/SeqIO/QualityIO.py @@ -996,6 +996,8 @@ def FastqGeneralIterator(source: _TextIOSource) -> Iterator[tuple[str, str, str] class FastqIteratorAbstractBaseClass(SequenceIterator[str]): """Abstract base class for FASTQ file parsers.""" + modes = "t" + @abstractproperty def q_mapping(self): """Dictionary that maps letters in the quality string to quality values.""" @@ -1015,7 +1017,7 @@ def __init__(self, source): The quality values are stored in the `letter_annotations` dictionary attribute under the key `q_key`. """ - super().__init__(source, mode="t", fmt="Fastq") + super().__init__(source, fmt="Fastq") self.line = None def __next__(self) -> SeqRecord: @@ -1421,6 +1423,8 @@ def __init__( class QualPhredIterator(SequenceIterator): """Parser for QUAL files with PHRED quality scores but no sequence.""" + modes = "t" + def __init__( self, source: _TextIOSource, @@ -1481,7 +1485,7 @@ def __init__( """ if alphabet is not None: raise ValueError("The alphabet argument is no longer supported") - super().__init__(source, mode="t", fmt="QUAL") + super().__init__(source, fmt="QUAL") # Skip any text before the first record (e.g. blank lines, comments) for line in self.stream: if line[0] == ">": diff --git a/Bio/SeqIO/SeqXmlIO.py b/Bio/SeqIO/SeqXmlIO.py index 52c05274b89..437f9948f9f 100644 --- a/Bio/SeqIO/SeqXmlIO.py +++ b/Bio/SeqIO/SeqXmlIO.py @@ -441,6 +441,8 @@ class SeqXmlIterator(SequenceIterator): method calls. """ + modes = "b" + # Small block size can be a problem with libexpat 2.6.0 onwards: BLOCK = 1024 @@ -451,7 +453,7 @@ def __init__(self, stream_or_path, namespace=None): # if the text handle was opened with a different encoding than the # one specified in the XML file. With a binary handle, the correct # encoding is picked up by the parser from the XML file. - super().__init__(stream_or_path, mode="b", fmt="SeqXML") + super().__init__(stream_or_path, fmt="SeqXML") stream = self.stream parser = sax.make_parser() content_handler = ContentHandler() diff --git a/Bio/SeqIO/SffIO.py b/Bio/SeqIO/SffIO.py index 2b39da78557..69d236b6b29 100644 --- a/Bio/SeqIO/SffIO.py +++ b/Bio/SeqIO/SffIO.py @@ -749,6 +749,8 @@ def _sff_read_raw_record(handle, number_of_flows_per_read): class SffIterator(SequenceIterator): """Parser for Standard Flowgram Format (SFF) files.""" + modes = "b" + # the read header format (fixed part): # read_header_length H # name_length H @@ -829,7 +831,7 @@ def __init__(self, source, alphabet=None, trim=False): """ if alphabet is not None: raise ValueError("The alphabet argument is no longer supported") - super().__init__(source, mode="b", fmt="SFF") + super().__init__(source, fmt="SFF") self.trim = trim stream = self.stream ( diff --git a/Bio/SeqIO/SnapGeneIO.py b/Bio/SeqIO/SnapGeneIO.py index 31611957224..f359e5e39d6 100644 --- a/Bio/SeqIO/SnapGeneIO.py +++ b/Bio/SeqIO/SnapGeneIO.py @@ -291,6 +291,8 @@ def _get_child_value(node, name, default=None, error=None): class SnapGeneIterator(SequenceIterator): """Parser for SnapGene files.""" + modes = "b" + def __init__(self, source): """Parse a SnapGene file and return a SeqRecord object. @@ -299,7 +301,7 @@ def __init__(self, source): Note that a SnapGene file can only contain one sequence, so this iterator will always return a single record. """ - super().__init__(source, mode="b", fmt="SnapGene") + super().__init__(source, fmt="SnapGene") self.packets = _iterate(self.stream) try: packet_type, length, data = next(self.packets) diff --git a/Bio/SeqIO/SwissIO.py b/Bio/SeqIO/SwissIO.py index 98205065d38..b432b70dddc 100644 --- a/Bio/SeqIO/SwissIO.py +++ b/Bio/SeqIO/SwissIO.py @@ -27,6 +27,8 @@ class SwissIterator(SequenceIterator): """Parser to break up a Swiss-Prot/UniProt file into SeqRecord objects.""" + modes = "t" + def __init__(self, source: _TextIOSource) -> None: """Iterate over a Swiss-Prot file and return SeqRecord objects. @@ -47,7 +49,7 @@ def __init__(self, source: _TextIOSource) -> None: Rather than calling it directly, you are expected to use this parser via Bio.SeqIO.parse(..., format="swiss") instead. """ - super().__init__(source, mode="t", fmt="SwissProt") + super().__init__(source, fmt="SwissProt") def __next__(self): swiss_record = SwissProt._read(self.stream) diff --git a/Bio/SeqIO/TabIO.py b/Bio/SeqIO/TabIO.py index 9e1efb04599..1732b2b55e5 100644 --- a/Bio/SeqIO/TabIO.py +++ b/Bio/SeqIO/TabIO.py @@ -44,6 +44,8 @@ class TabIterator(SequenceIterator): """Parser for tab-delimited files.""" + modes = "t" + def __init__(self, source): """Iterate over tab separated lines as SeqRecord objects. @@ -75,7 +77,7 @@ def __init__(self, source): gi|45478721|ref|NP_995576.1| length 90 """ - super().__init__(source, mode="t", fmt="Tab-separated plain-text") + super().__init__(source, fmt="Tab-separated plain-text") def __next__(self): for line in self.stream: diff --git a/Bio/SeqIO/TwoBitIO.py b/Bio/SeqIO/TwoBitIO.py index 69a2e164059..2b507fd2623 100644 --- a/Bio/SeqIO/TwoBitIO.py +++ b/Bio/SeqIO/TwoBitIO.py @@ -170,9 +170,11 @@ def lower(self): class TwoBitIterator(SequenceIterator): """Parser for UCSC twoBit (.2bit) files.""" + modes = "b" + def __init__(self, source): """Read the file index.""" - super().__init__(source, mode="b", fmt="twoBit") + super().__init__(source, fmt="twoBit") # wait to close the file until the TwoBitIterator goes out of scope: self.should_close_stream = False stream = self.stream diff --git a/Bio/SeqIO/UniprotIO.py b/Bio/SeqIO/UniprotIO.py index 9d5e4b727c8..2f635fe1c41 100644 --- a/Bio/SeqIO/UniprotIO.py +++ b/Bio/SeqIO/UniprotIO.py @@ -18,11 +18,14 @@ from xml.etree import ElementTree from xml.parsers.expat import errors +import warnings from Bio import SeqFeature from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord +from Bio import BiopythonDeprecationWarning + from .Interfaces import _BytesIOSource from .Interfaces import SequenceIterator @@ -34,6 +37,8 @@ class UniprotIterator(SequenceIterator): """Parser for UniProt XML files, returning SeqRecord objects.""" + modes = "bt" + def __init__( self, source: _BytesIOSource, @@ -55,7 +60,17 @@ def __init__( """ if alphabet is not None: raise ValueError("The alphabet argument is no longer supported") - super().__init__(source, mode="b", fmt="UniProt XML") + super().__init__(source, fmt="UniProt XML") + if self.mode == "t": + warnings.warn( + "Opening a UniProt XML file in text mode is " + "deprecated, as it may lead to garbled characters. " + "We recommend opening the file in binary mode; " + "parsing UniProt XML files opened in text mode will " + "no longer be supported in a future release of " + "Biopython.", + BiopythonDeprecationWarning, + ) self.return_raw_comments = return_raw_comments self._data = ElementTree.iterparse( self.stream, events=("start", "start-ns", "end") diff --git a/Bio/SeqIO/XdnaIO.py b/Bio/SeqIO/XdnaIO.py index e75b8174c67..754826c77c1 100644 --- a/Bio/SeqIO/XdnaIO.py +++ b/Bio/SeqIO/XdnaIO.py @@ -145,6 +145,8 @@ def _read_feature(handle, record): class XdnaIterator(SequenceIterator): """Parser for Xdna files.""" + modes = "b" + def __init__(self, source): """Parse a Xdna file and return a SeqRecord object. @@ -154,7 +156,7 @@ def __init__(self, source): contain a single sequence. """ - super().__init__(source, mode="b", fmt="Xdna") + super().__init__(source, fmt="Xdna") header = self.stream.read(112) if not header: raise ValueError("Empty file.") diff --git a/DEPRECATED.rst b/DEPRECATED.rst index d92ab3758b1..8f0a595f575 100644 --- a/DEPRECATED.rst +++ b/DEPRECATED.rst @@ -75,6 +75,14 @@ Another option is to use ``format='fasta-blast'``; this follows the FASTA file format accepted by BLAST, treating any lines starting with '#', ';', or '!' as comment lines and ignoring them. +Bio.SeqIO.UniprotIO +------------------- +Parsing a UniProt XML file opened in text mode (if the file was opened using +``open("myuniprotfile.xml")``) was deprecated in Release 1.85, as this may lead +to garbled characters. Please open the file in binary mode (as in +``open("myuniprotfile.xml", "rb")``), or let ``Bio.SeqIO.parse`` take care of +opening and closing files by passing the file name instead of a file handle. + Bio.Entrez ---------- The ``egquery`` function wrapping the NCBI EGQuery (Entrez Global Query) diff --git a/Tests/test_SeqIO_UniprotIO.py b/Tests/test_SeqIO_UniprotIO.py index 8ce5a9609a4..c028c0c4280 100644 --- a/Tests/test_SeqIO_UniprotIO.py +++ b/Tests/test_SeqIO_UniprotIO.py @@ -13,18 +13,20 @@ from Bio import SeqIO from Bio.SeqRecord import SeqRecord +from Bio import BiopythonDeprecationWarning + class ParserTests(SeqRecordTestBaseClass): """Tests Uniprot XML parser.""" - def test_uni001(self): + def check_uni001(self, mode): """Parsing Uniprot file uni001.""" filename = "uni001" # test the record parser datafile = os.path.join("SwissProt", filename) - with open(datafile, "rb") as handle: + with open(datafile, mode) as handle: seq_record = SeqIO.read(handle, "uniprot-xml") self.assertIsInstance(seq_record, SeqRecord) @@ -133,6 +135,12 @@ def test_uni001(self): self.assertEqual(seq_record.annotations["sequence_version"], 1) self.assertEqual(seq_record.annotations["proteinExistence"], ["Predicted"]) + def test_uni001(self): + """Parsing Uniprot file uni001 in text mode and in binary mode.""" + self.check_uni001("rb") + with self.assertWarns(BiopythonDeprecationWarning): + self.check_uni001("rt") + def test_uni003(self): """Parsing Uniprot file uni003.""" filename = "uni003"