Skip to content

Commit

Permalink
Allow parsers in Bio.SeqIO to handle both text and binary modes (biop…
Browse files Browse the repository at this point in the history
  • Loading branch information
mdehoon authored Sep 18, 2024
1 parent 296e815 commit ea8ff47
Show file tree
Hide file tree
Showing 23 changed files with 152 additions and 41 deletions.
4 changes: 3 additions & 1 deletion Bio/SeqIO/AbiIO.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,9 +347,11 @@ def _get_string_tag(opt_bytes_value, default=None):
class AbiIterator(SequenceIterator):
"""Parser for Abi files."""

modes = "b"

def __init__(self, source, trim=False):
"""Return an iterator for the Abi file format."""
super().__init__(source, mode="b", fmt="ABI")
super().__init__(source, fmt="ABI")
# check if input file is a valid Abi file
marker = self.stream.read(4)
if not marker:
Expand Down
6 changes: 4 additions & 2 deletions Bio/SeqIO/AceIO.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
class AceIterator(SequenceIterator):
"""Return SeqRecord objects from an ACE file."""

modes = "t"

def __init__(
self,
source: _TextIOSource,
Expand Down Expand Up @@ -69,8 +71,8 @@ def __init__(
90
"""
super().__init__(source, mode="t", fmt="ACE")
self.ace_contigs = Ace._parse(self.stream)
super().__init__(source, fmt="ACE")
self.ace_contigs = Ace.parse(self.stream)

def __next__(self):
try:
Expand Down
16 changes: 12 additions & 4 deletions Bio/SeqIO/FastaIO.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,8 @@ def FastaTwoLineParser(handle):
class FastaIterator(SequenceIterator):
"""Parser for plain Fasta files without comments."""

modes = "t"

def __init__(
self,
source: _TextIOSource,
Expand Down Expand Up @@ -191,7 +193,7 @@ def __init__(
"""
if alphabet is not None:
raise ValueError("The alphabet argument is no longer supported")
super().__init__(source, mode="t", fmt="Fasta")
super().__init__(source, fmt="Fasta")
try:
line = next(self.stream)
except StopIteration:
Expand Down Expand Up @@ -266,6 +268,8 @@ def __next__(self):
class FastaTwoLineIterator(SequenceIterator):
"""Parser for Fasta files with exactly two lines per record."""

modes = "t"

def __init__(self, source):
"""Iterate over two-line Fasta records (as SeqRecord objects).
Expand All @@ -278,7 +282,7 @@ def __init__(self, source):
Only the default title to ID/name/description parsing offered
by the relaxed FASTA parser is offered.
"""
super().__init__(source, mode="t", fmt="FASTA")
super().__init__(source, fmt="FASTA")
self._data = FastaTwoLineParser(self.stream)

def __next__(self):
Expand All @@ -300,6 +304,8 @@ def __next__(self):
class FastaBlastIterator(SequenceIterator):
"""Parser for Fasta files, allowing for comments as in BLAST."""

modes = "t"

def __init__(
self,
source: _TextIOSource,
Expand Down Expand Up @@ -348,7 +354,7 @@ def __init__(
"""
if alphabet is not None:
raise ValueError("The alphabet argument is no longer supported")
super().__init__(source, mode="t", fmt="FASTA")
super().__init__(source, fmt="FASTA")
for line in self.stream:
if line[0] not in "#!;":
if not line.startswith(">"):
Expand Down Expand Up @@ -397,6 +403,8 @@ def __next__(self):
class FastaPearsonIterator(SequenceIterator):
"""Parser for Fasta files, allowing for comments as in the FASTA aligner."""

modes = "t"

def __init__(
self,
source: _TextIOSource,
Expand Down Expand Up @@ -446,7 +454,7 @@ def __init__(
"""
if alphabet is not None:
raise ValueError("The alphabet argument is no longer supported")
super().__init__(source, mode="t", fmt="Fasta")
super().__init__(source, fmt="Fasta")
for line in self.stream:
if line.startswith(">"):
self._line = line
Expand Down
4 changes: 3 additions & 1 deletion Bio/SeqIO/GckIO.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,13 +73,15 @@ def _read_p4string(stream):
class GckIterator(SequenceIterator):
"""Parser for GCK files."""

modes = "b"

def __init__(self, source):
"""Break up a GCK file into SeqRecord objects.
Note that a GCK file can only contain one sequence, so this
iterator will always return a single record.
"""
super().__init__(source, mode="b", fmt="GCK")
super().__init__(source, fmt="GCK")
# Skip file header
# GCK files start with a 24-bytes header. Bytes 4 and 8 seem to
# always be 12, maybe this could act as a magic cookie. Bytes
Expand Down
8 changes: 6 additions & 2 deletions Bio/SeqIO/GfaIO.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,8 @@ class Gfa1Iterator(SequenceIterator):
Documentation: https://gfa-spec.github.io/GFA-spec/GFA1.html
"""

modes = "t"

def __init__(
self,
source: _TextIOSource,
Expand All @@ -128,7 +130,7 @@ def __init__(
Arguments:
- source - input stream opened in text mode, or a path to a file
"""
super().__init__(source, mode="t", fmt="GFA 1.0")
super().__init__(source, fmt="GFA 1.0")

def __next__(self):
for line in self.stream:
Expand Down Expand Up @@ -164,6 +166,8 @@ class Gfa2Iterator(SequenceIterator):
Documentation for version 2: https://gfa-spec.github.io/GFA-spec/GFA2.html
"""

modes = "t"

def __init__(
self,
source: _TextIOSource,
Expand All @@ -173,7 +177,7 @@ def __init__(
Arguments:
- source - input stream opened in text mode, or a path to a file
"""
super().__init__(source, mode="t", fmt="GFA 2.0")
super().__init__(source, fmt="GFA 2.0")

def __next__(self):
for line in self.stream:
Expand Down
4 changes: 3 additions & 1 deletion Bio/SeqIO/IgIO.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
class IgIterator(SequenceIterator):
"""Parser for IntelliGenetics files."""

modes = "t"

def __init__(self, source):
"""Iterate over IntelliGenetics records (as SeqRecord objects).
Expand Down Expand Up @@ -60,7 +62,7 @@ def __init__(self, source):
SYK_SYK length 330
"""
super().__init__(source, mode="t", fmt="IntelliGenetics")
super().__init__(source, fmt="IntelliGenetics")
for line in self.stream:
if not line.startswith(";;"):
break
Expand Down
20 changes: 15 additions & 5 deletions Bio/SeqIO/InsdcIO.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@
class GenBankIterator(SequenceIterator):
"""Parser for GenBank files."""

modes = "t"

def __init__(self, source):
"""Break up a Genbank file into SeqRecord objects.
Expand Down Expand Up @@ -99,7 +101,7 @@ def __init__(self, source):
AF297471.1
"""
super().__init__(source, mode="t", fmt="GenBank")
super().__init__(source, fmt="GenBank")
self.records = GenBankScanner(debug=0).parse_records(self.stream)

def __next__(self):
Expand All @@ -115,6 +117,8 @@ def __next__(self):
class EmblIterator(SequenceIterator):
"""Parser for EMBL files."""

modes = "t"

def __init__(self, source):
"""Break up an EMBL file into SeqRecord objects.
Expand Down Expand Up @@ -158,7 +162,7 @@ def __init__(self, source):
CQ797900.1
"""
super().__init__(source, mode="t", fmt="EMBL")
super().__init__(source, fmt="EMBL")
self.records = EmblScanner(debug=0).parse_records(self.stream)

def __next__(self):
Expand All @@ -174,6 +178,8 @@ def __next__(self):
class ImgtIterator(SequenceIterator):
"""Parser for IMGT files."""

modes = "t"

def __init__(self, source):
"""Break up an IMGT file into SeqRecord objects.
Expand All @@ -184,7 +190,7 @@ def __init__(self, source):
Note that for genomes or chromosomes, there is typically only
one record.
"""
super().__init__(source, mode="t", fmt="IMGT")
super().__init__(source, fmt="IMGT")
self.records = _ImgtScanner(debug=0).parse_records(self.stream)

def __next__(self):
Expand All @@ -200,6 +206,8 @@ def __next__(self):
class GenBankCdsFeatureIterator(SequenceIterator):
"""Parser for GenBank files, creating a SeqRecord for each CDS feature."""

modes = "t"

def __init__(self, source):
"""Break up a Genbank file into SeqRecord objects for each CDS feature.
Expand All @@ -209,7 +217,7 @@ def __init__(self, source):
many CDS features. These are returned with the stated amino acid
translation sequence (if given).
"""
super().__init__(source, mode="t", fmt="GenBank")
super().__init__(source, fmt="GenBank")
self.records = GenBankScanner(debug=0).parse_cds_features(self.stream)

def __next__(self):
Expand All @@ -225,6 +233,8 @@ def __next__(self):
class EmblCdsFeatureIterator(SequenceIterator):
"""Parser for EMBL files, creating a SeqRecord for each CDS feature."""

modes = "t"

def __init__(self, source):
"""Break up an EMBL file into SeqRecord objects for each CDS feature.
Expand All @@ -234,7 +244,7 @@ def __init__(self, source):
many CDS features. These are returned with the stated amino acid
translation sequence (if given).
"""
super().__init__(source, mode="t", fmt="EMBL")
super().__init__(source, fmt="EMBL")
self.records = EmblScanner(debug=0).parse_cds_features(self.stream)

def __next__(self):
Expand Down
36 changes: 27 additions & 9 deletions Bio/SeqIO/Interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

from abc import ABC
from abc import abstractmethod
from abc import abstractproperty
from os import PathLike
from typing import AnyStr
from typing import Generic
Expand All @@ -37,21 +38,32 @@ class SequenceIterator(ABC, Generic[AnyStr]):
You should write a __next__ method that returns the next SeqRecord. You
may wish to redefine the __init__ method as well.
You must also create a class property `modes` specifying the allowable
file stream modes.
"""

# abc.abstractproperty has been deprecated since Python 3.3; stacking
# @property on top of @abstractmethod is the documented replacement and
# still prevents instantiation of subclasses that do not define `modes`.
@property
@abstractmethod
def modes(self):
    """File modes (binary or text) that the parser can handle.

    This property must be "t" (for text mode only), "b" (for binary mode
    only), "tb" (if both text and binary mode are accepted, but text mode
    is preferred), or "bt" (if both text and binary mode are accepted, but
    binary mode is preferred).

    Concrete parsers normally satisfy this by assigning a plain class
    attribute, e.g. ``modes = "t"``.
    """

def __init__(
self,
source: _IOSource,
alphabet: None = None,
mode: str = "t",
fmt: Optional[str] = None,
) -> None:
"""Create a SequenceIterator object.
Arguments:
- source - input file stream, or path to input file
- alphabet - no longer used, should be None
- mode - string, either "t" for text mode or "b" for binary
- fmt - string, mixed case format name for use in error messages
This method MAY be overridden by any subclass.
Expand All @@ -63,24 +75,30 @@ def __init__(
"""
if alphabet is not None:
raise ValueError("The alphabet argument is no longer supported")
modes = self.modes
if isinstance(source, _PathLikeTypes):
mode = modes[0]
self.stream = open(source, "r" + mode)
self.should_close_stream = True
else:
if mode == "t":
if source.read(0) != "":
value = source.read(0)
if value == "":
if modes == "b":
raise StreamModeError(
f"{fmt} files must be opened in text mode."
f"{fmt} files must be opened in binary mode."
) from None
elif mode == "b":
if source.read(0) != b"":
mode = "t"
elif value == b"":
if modes == "t":
raise StreamModeError(
f"{fmt} files must be opened in binary mode."
f"{fmt} files must be opened in text mode."
) from None
mode = "b"
else:
raise ValueError(f"Unknown mode '{mode}'") from None
raise RuntimeError("Failed to read from input data") from None
self.stream = source
self.should_close_stream = False
self.mode = mode

@abstractmethod
def __next__(self):
Expand Down
4 changes: 3 additions & 1 deletion Bio/SeqIO/NibIO.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@
class NibIterator(SequenceIterator):
"""Parser for nib files."""

modes = "b"

def __init__(self, source):
"""Iterate over a nib file and yield a SeqRecord.
Expand All @@ -79,7 +81,7 @@ def __init__(self, source):
nAGAAGagccgcNGgCActtGAnTAtCGTCgcCacCaGncGncTtGNtGG 50
"""
super().__init__(source, mode="b", fmt="Nib")
super().__init__(source, fmt="Nib")
word = self.stream.read(4)
if not word:
raise ValueError("Empty file.")
Expand Down
Loading

0 comments on commit ea8ff47

Please sign in to comment.