Allow parsers in Bio.SeqIO to handle both text and binary modes (biopython#4842)
mdehoon · Sep 18, 2024 · ea8ff47 · ea8ff47
1 parent 296e815
commit ea8ff47
Show file tree

Hide file tree

Showing 23 changed files with 152 additions and 41 deletions.
diff --git a/Bio/SeqIO/AbiIO.py b/Bio/SeqIO/AbiIO.py
@@ -347,9 +347,11 @@ def _get_string_tag(opt_bytes_value, default=None):
 class AbiIterator(SequenceIterator):
     """Parser for Abi files."""
 
+    modes = "b"
+
     def __init__(self, source, trim=False):
         """Return an iterator for the Abi file format."""
-        super().__init__(source, mode="b", fmt="ABI")
+        super().__init__(source, fmt="ABI")
         # check if input file is a valid Abi file
         marker = self.stream.read(4)
         if not marker:

diff --git a/Bio/SeqIO/AceIO.py b/Bio/SeqIO/AceIO.py
@@ -22,6 +22,8 @@
 class AceIterator(SequenceIterator):
     """Return SeqRecord objects from an ACE file."""
 
+    modes = "t"
+
     def __init__(
         self,
         source: _TextIOSource,
@@ -69,8 +71,8 @@ def __init__(
         90
 
         """
-        super().__init__(source, mode="t", fmt="ACE")
-        self.ace_contigs = Ace._parse(self.stream)
+        super().__init__(source, fmt="ACE")
+        self.ace_contigs = Ace.parse(self.stream)
 
     def __next__(self):
         try:

diff --git a/Bio/SeqIO/FastaIO.py b/Bio/SeqIO/FastaIO.py
@@ -143,6 +143,8 @@ def FastaTwoLineParser(handle):
 class FastaIterator(SequenceIterator):
     """Parser for plain Fasta files without comments."""
 
+    modes = "t"
+
     def __init__(
         self,
         source: _TextIOSource,
@@ -191,7 +193,7 @@ def __init__(
         """
         if alphabet is not None:
             raise ValueError("The alphabet argument is no longer supported")
-        super().__init__(source, mode="t", fmt="Fasta")
+        super().__init__(source, fmt="Fasta")
         try:
             line = next(self.stream)
         except StopIteration:
@@ -266,6 +268,8 @@ def __next__(self):
 class FastaTwoLineIterator(SequenceIterator):
     """Parser for Fasta files with exactly two lines per record."""
 
+    modes = "t"
+
     def __init__(self, source):
         """Iterate over two-line Fasta records (as SeqRecord objects).
 
@@ -278,7 +282,7 @@ def __init__(self, source):
         Only the default title to ID/name/description parsing offered
         by the relaxed FASTA parser is offered.
         """
-        super().__init__(source, mode="t", fmt="FASTA")
+        super().__init__(source, fmt="FASTA")
         self._data = FastaTwoLineParser(self.stream)
 
     def __next__(self):
@@ -300,6 +304,8 @@ def __next__(self):
 class FastaBlastIterator(SequenceIterator):
     """Parser for Fasta files, allowing for comments as in BLAST."""
 
+    modes = "t"
+
     def __init__(
         self,
         source: _TextIOSource,
@@ -348,7 +354,7 @@ def __init__(
         """
         if alphabet is not None:
             raise ValueError("The alphabet argument is no longer supported")
-        super().__init__(source, mode="t", fmt="FASTA")
+        super().__init__(source, fmt="FASTA")
         for line in self.stream:
             if line[0] not in "#!;":
                 if not line.startswith(">"):
@@ -397,6 +403,8 @@ def __next__(self):
 class FastaPearsonIterator(SequenceIterator):
     """Parser for Fasta files, allowing for comments as in the FASTA aligner."""
 
+    modes = "t"
+
     def __init__(
         self,
         source: _TextIOSource,
@@ -446,7 +454,7 @@ def __init__(
         """
         if alphabet is not None:
             raise ValueError("The alphabet argument is no longer supported")
-        super().__init__(source, mode="t", fmt="Fasta")
+        super().__init__(source, fmt="Fasta")
         for line in self.stream:
             if line.startswith(">"):
                 self._line = line

diff --git a/Bio/SeqIO/GckIO.py b/Bio/SeqIO/GckIO.py
@@ -73,13 +73,15 @@ def _read_p4string(stream):
 class GckIterator(SequenceIterator):
     """Parser for GCK files."""
 
+    modes = "b"
+
     def __init__(self, source):
         """Break up a GCK file into SeqRecord objects.
 
         Note that a GCK file can only contain one sequence, so this
         iterator will always return a single record.
         """
-        super().__init__(source, mode="b", fmt="GCK")
+        super().__init__(source, fmt="GCK")
         # Skip file header
         # GCK files start with a 24-bytes header. Bytes 4 and 8 seem to
         # always be 12, maybe this could act as a magic cookie. Bytes

diff --git a/Bio/SeqIO/GfaIO.py b/Bio/SeqIO/GfaIO.py
@@ -119,6 +119,8 @@ class Gfa1Iterator(SequenceIterator):
     Documentation: https://gfa-spec.github.io/GFA-spec/GFA1.html
     """
 
+    modes = "t"
+
     def __init__(
         self,
         source: _TextIOSource,
@@ -128,7 +130,7 @@ def __init__(
         Arguments:
          - source - input stream opened in text mode, or a path to a file
         """
-        super().__init__(source, mode="t", fmt="GFA 1.0")
+        super().__init__(source, fmt="GFA 1.0")
 
     def __next__(self):
         for line in self.stream:
@@ -164,6 +166,8 @@ class Gfa2Iterator(SequenceIterator):
     Documentation for version 2: https://gfa-spec.github.io/GFA-spec/GFA2.html
     """
 
+    modes = "t"
+
     def __init__(
         self,
         source: _TextIOSource,
@@ -173,7 +177,7 @@ def __init__(
         Arguments:
          - source - input stream opened in text mode, or a path to a file
         """
-        super().__init__(source, mode="t", fmt="GFA 2.0")
+        super().__init__(source, fmt="GFA 2.0")
 
     def __next__(self):
         for line in self.stream:

diff --git a/Bio/SeqIO/IgIO.py b/Bio/SeqIO/IgIO.py
@@ -22,6 +22,8 @@
 class IgIterator(SequenceIterator):
     """Parser for IntelliGenetics files."""
 
+    modes = "t"
+
     def __init__(self, source):
         """Iterate over IntelliGenetics records (as SeqRecord objects).
 
@@ -60,7 +62,7 @@ def __init__(self, source):
         SYK_SYK length 330
 
         """
-        super().__init__(source, mode="t", fmt="IntelliGenetics")
+        super().__init__(source, fmt="IntelliGenetics")
         for line in self.stream:
             if not line.startswith(";;"):
                 break

diff --git a/Bio/SeqIO/InsdcIO.py b/Bio/SeqIO/InsdcIO.py
@@ -62,6 +62,8 @@
 class GenBankIterator(SequenceIterator):
     """Parser for GenBank files."""
 
+    modes = "t"
+
     def __init__(self, source):
         """Break up a Genbank file into SeqRecord objects.
 
@@ -99,7 +101,7 @@ def __init__(self, source):
         AF297471.1
 
         """
-        super().__init__(source, mode="t", fmt="GenBank")
+        super().__init__(source, fmt="GenBank")
         self.records = GenBankScanner(debug=0).parse_records(self.stream)
 
     def __next__(self):
@@ -115,6 +117,8 @@ def __next__(self):
 class EmblIterator(SequenceIterator):
     """Parser for EMBL files."""
 
+    modes = "t"
+
     def __init__(self, source):
         """Break up an EMBL file into SeqRecord objects.
 
@@ -158,7 +162,7 @@ def __init__(self, source):
         CQ797900.1
 
         """
-        super().__init__(source, mode="t", fmt="EMBL")
+        super().__init__(source, fmt="EMBL")
         self.records = EmblScanner(debug=0).parse_records(self.stream)
 
     def __next__(self):
@@ -174,6 +178,8 @@ def __next__(self):
 class ImgtIterator(SequenceIterator):
     """Parser for IMGT files."""
 
+    modes = "t"
+
     def __init__(self, source):
         """Break up an IMGT file into SeqRecord objects.
 
@@ -184,7 +190,7 @@ def __init__(self, source):
         Note that for genomes or chromosomes, there is typically only
         one record.
         """
-        super().__init__(source, mode="t", fmt="IMGT")
+        super().__init__(source, fmt="IMGT")
         self.records = _ImgtScanner(debug=0).parse_records(self.stream)
 
     def __next__(self):
@@ -200,6 +206,8 @@ def __next__(self):
 class GenBankCdsFeatureIterator(SequenceIterator):
     """Parser for GenBank files, creating a SeqRecord for each CDS feature."""
 
+    modes = "t"
+
     def __init__(self, source):
         """Break up a Genbank file into SeqRecord objects for each CDS feature.
 
@@ -209,7 +217,7 @@ def __init__(self, source):
         many CDS features.  These are returned as with the stated amino acid
         translation sequence (if given).
         """
-        super().__init__(source, mode="t", fmt="GenBank")
+        super().__init__(source, fmt="GenBank")
         self.records = GenBankScanner(debug=0).parse_cds_features(self.stream)
 
     def __next__(self):
@@ -225,6 +233,8 @@ def __next__(self):
 class EmblCdsFeatureIterator(SequenceIterator):
     """Parser for EMBL files, creating a SeqRecord for each CDS feature."""
 
+    modes = "t"
+
     def __init__(self, source):
         """Break up a EMBL file into SeqRecord objects for each CDS feature.
 
@@ -234,7 +244,7 @@ def __init__(self, source):
         many CDS features.  These are returned as with the stated amino acid
         translation sequence (if given).
         """
-        super().__init__(source, mode="t", fmt="EMBL")
+        super().__init__(source, fmt="EMBL")
         self.records = EmblScanner(debug=0).parse_cds_features(self.stream)
 
     def __next__(self):

diff --git a/Bio/SeqIO/Interfaces.py b/Bio/SeqIO/Interfaces.py
@@ -12,6 +12,7 @@
 
 from abc import ABC
 from abc import abstractmethod
+from abc import abstractproperty
 from os import PathLike
 from typing import AnyStr
 from typing import Generic
@@ -37,21 +38,32 @@ class SequenceIterator(ABC, Generic[AnyStr]):
 
     You should write a __next__ method that returns the next SeqRecord.  You
     may wish to redefine the __init__ method as well.
+    You must also create a class property `modes` specifying the allowable
+    file stream modes.
     """
 
+    @abstractproperty
+    def modes(self):
+        """File modes (binary or text) that the parser can handle.
+
+        This property must be "t" (for text mode only), "b" (for binary mode
+        only), "tb" (if both text and binary mode are accepted, but text mode
+        is preferred), or "bt" (if both text and binary mode are accepted, but
+        binary mode is preferred).
+        """
+        pass
+
     def __init__(
         self,
         source: _IOSource,
         alphabet: None = None,
-        mode: str = "t",
         fmt: Optional[str] = None,
     ) -> None:
         """Create a SequenceIterator object.
 
         Arguments:
         - source - input file stream, or path to input file
         - alphabet - no longer used, should be None
-        - mode - string, either "t" for text mode or "b" for binary
         - fmt - string, mixed case format name for in error messages
 
         This method MAY be overridden by any subclass.
@@ -63,24 +75,30 @@ def __init__(
         """
         if alphabet is not None:
             raise ValueError("The alphabet argument is no longer supported")
+        modes = self.modes
         if isinstance(source, _PathLikeTypes):
+            mode = modes[0]
             self.stream = open(source, "r" + mode)
             self.should_close_stream = True
         else:
-            if mode == "t":
-                if source.read(0) != "":
+            value = source.read(0)
+            if value == "":
+                if modes == "b":
                     raise StreamModeError(
-                        f"{fmt} files must be opened in text mode."
+                        f"{fmt} files must be opened in binary mode."
                     ) from None
-            elif mode == "b":
-                if source.read(0) != b"":
+                mode = "t"
+            elif value == b"":
+                if modes == "t":
                     raise StreamModeError(
-                        f"{fmt} files must be opened in binary mode."
+                        f"{fmt} files must be opened in text mode."
                     ) from None
+                mode = "b"
             else:
-                raise ValueError(f"Unknown mode '{mode}'") from None
+                raise RuntimeError("Failed to read from input data") from None
             self.stream = source
             self.should_close_stream = False
+        self.mode = mode
 
     @abstractmethod
     def __next__(self):

diff --git a/Bio/SeqIO/NibIO.py b/Bio/SeqIO/NibIO.py
@@ -53,6 +53,8 @@
 class NibIterator(SequenceIterator):
     """Parser for nib files."""
 
+    modes = "b"
+
     def __init__(self, source):
         """Iterate over a nib file and yield a SeqRecord.
 
@@ -79,7 +81,7 @@ def __init__(self, source):
         nAGAAGagccgcNGgCActtGAnTAtCGTCgcCacCaGncGncTtGNtGG 50
 
         """
-        super().__init__(source, mode="b", fmt="Nib")
+        super().__init__(source, fmt="Nib")
         word = self.stream.read(4)
         if not word:
             raise ValueError("Empty file.")