From 69b466f0382fa5f407dcbd357e4e8c86fa819494 Mon Sep 17 00:00:00 2001 From: mdehoon Date: Fri, 6 Sep 2024 00:40:55 +0900 Subject: [PATCH] Avoid yield in Bio.SeqIO SffIO (#4813) --- Bio/SeqIO/SffIO.py | 556 +++++++++---------- Bio/SeqIO/_index.py | 49 +- Tests/{test_SffIO.py => test_SeqIO_SffIO.py} | 0 Tests/test_SeqIO_index.py | 20 +- 4 files changed, 299 insertions(+), 326 deletions(-) rename Tests/{test_SffIO.py => test_SeqIO_SffIO.py} (100%) diff --git a/Bio/SeqIO/SffIO.py b/Bio/SeqIO/SffIO.py index a7ffc868a2f..ef53f0f7160 100644 --- a/Bio/SeqIO/SffIO.py +++ b/Bio/SeqIO/SffIO.py @@ -275,7 +275,7 @@ def _sff_file_header(handle): """ # file header (part one) - # use big endiean encdoing > + # use big endian encoding > # magic_number I # version 4B # index_offset Q @@ -643,155 +643,6 @@ def _sff_read_roche_index(handle): _valid_UAN_read_name = re.compile(r"^[a-zA-Z0-9]{14}$") -def _sff_read_seq_record( - handle, number_of_flows_per_read, flow_chars, key_sequence, trim=False -): - """Parse the next read in the file, return data as a SeqRecord (PRIVATE).""" - # Now on to the reads... - # the read header format (fixed part): - # read_header_length H - # name_length H - # seq_len I - # clip_qual_left H - # clip_qual_right H - # clip_adapter_left H - # clip_adapter_right H - # [rest of read header depends on the name length etc] - read_header_fmt = ">2HI4H" - read_header_size = struct.calcsize(read_header_fmt) - read_flow_fmt = ">%iH" % number_of_flows_per_read - read_flow_size = struct.calcsize(read_flow_fmt) - - ( - read_header_length, - name_length, - seq_len, - clip_qual_left, - clip_qual_right, - clip_adapter_left, - clip_adapter_right, - ) = struct.unpack(read_header_fmt, handle.read(read_header_size)) - if clip_qual_left: - clip_qual_left -= 1 # python counting - if clip_adapter_left: - clip_adapter_left -= 1 # python counting - if read_header_length < 10 or read_header_length % 8 != 0: - raise ValueError( - "Malformed read header, says length is %i" % read_header_length - ) - # now the name and any padding (remainder of header) - name = handle.read(name_length).decode() - padding = read_header_length - read_header_size - name_length - if handle.read(padding).count(_null) != padding: - import warnings - - from Bio import BiopythonParserWarning - - warnings.warn( - "Your SFF file is invalid, post name %i " - "byte padding region contained data" % padding, - BiopythonParserWarning, - ) - # now the flowgram values, flowgram index, bases and qualities - # NOTE - assuming flowgram_format==1, which means struct type H - flow_values = handle.read(read_flow_size) # unpack later if needed - temp_fmt = ">%iB" % seq_len # used for flow index and quals - flow_index = handle.read(seq_len) # unpack later if needed - seq = handle.read(seq_len) # Leave as bytes for Seq object - quals = list(struct.unpack(temp_fmt, handle.read(seq_len))) - # now any padding... - padding = (read_flow_size + seq_len * 3) % 8 - if padding: - padding = 8 - padding - if handle.read(padding).count(_null) != padding: - import warnings - - from Bio import BiopythonParserWarning - - warnings.warn( - "Your SFF file is invalid, post quality %i " - "byte padding region contained data" % padding, - BiopythonParserWarning, - ) - # Follow Roche and apply most aggressive of qual and adapter clipping. - # Note Roche seems to ignore adapter clip fields when writing SFF, - # and uses just the quality clipping values for any clipping. - clip_left = max(clip_qual_left, clip_adapter_left) - # Right clipping of zero means no clipping - if clip_qual_right: - if clip_adapter_right: - clip_right = min(clip_qual_right, clip_adapter_right) - else: - # Typical case with Roche SFF files - clip_right = clip_qual_right - elif clip_adapter_right: - clip_right = clip_adapter_right - else: - clip_right = seq_len - # Now build a SeqRecord - if trim: - if clip_left >= clip_right: - # Raise an error? - import warnings - - from Bio import BiopythonParserWarning - - warnings.warn( - "Overlapping clip values in SFF record, trimmed to nothing", - BiopythonParserWarning, - ) - seq = "" - quals = [] - else: - seq = seq[clip_left:clip_right].upper() - quals = quals[clip_left:clip_right] - # Don't record the clipping values, flow etc, they make no sense now: - annotations = {} - else: - if clip_left >= clip_right: - import warnings - - from Bio import BiopythonParserWarning - - warnings.warn( - "Overlapping clip values in SFF record", BiopythonParserWarning - ) - seq = seq.lower() - else: - # This use of mixed case mimics the Roche SFF tool's FASTA output - seq = ( - seq[:clip_left].lower() - + seq[clip_left:clip_right].upper() - + seq[clip_right:].lower() - ) - annotations = { - "flow_values": struct.unpack(read_flow_fmt, flow_values), - "flow_index": struct.unpack(temp_fmt, flow_index), - "flow_chars": flow_chars, - "flow_key": key_sequence, - "clip_qual_left": clip_qual_left, - "clip_qual_right": clip_qual_right, - "clip_adapter_left": clip_adapter_left, - "clip_adapter_right": clip_adapter_right, - } - if re.match(_valid_UAN_read_name, name): - annotations["time"] = _get_read_time(name) - annotations["region"] = _get_read_region(name) - annotations["coords"] = _get_read_xy(name) - annotations["molecule_type"] = "DNA" - record = SeqRecord( - Seq(seq), id=name, name=name, description="", annotations=annotations - ) - # Dirty trick to speed up this line: - # record.letter_annotations["phred_quality"] = quals - dict.__setitem__(record._per_letter_annotations, "phred_quality", quals) - # Return the record and then continue... - return record - - -_powers_of_36 = [36**i for i in range(6)] - - def _string_as_base_36(string): """Interpret a string as a base-36 number as per 454 manual (PRIVATE).""" total = 0 @@ -895,38 +746,22 @@ def _sff_read_raw_record(handle, number_of_flows_per_read): return raw -class _AddTellHandle: - """Wrapper for handles which do not support the tell method (PRIVATE). - - Intended for use with things like network handles where tell (and reverse - seek) are not supported. The SFF file needs to track the current offset in - order to deal with the index block. - """ - - def __init__(self, handle): - self._handle = handle - self._offset = 0 - - def read(self, length): - data = self._handle.read(length) - self._offset += len(data) - return data - - def tell(self): - return self._offset - - def seek(self, offset): - if offset < self._offset: - raise RuntimeError("Can't seek backwards") - self._handle.read(offset - self._offset) - - def close(self): - return self._handle.close() - - class SffIterator(SequenceIterator): """Parser for Standard Flowgram Format (SFF) files.""" + # the read header format (fixed part): + # read_header_length H + # name_length H + # seq_len I + # clip_qual_left H + # clip_qual_right H + # clip_adapter_left H + # clip_adapter_right H + # [rest of read header depends on the name length etc] + read_header_fmt = ">2HI4H" + read_header_size = struct.calcsize(read_header_fmt) + assert read_header_size % 8 == 0 # Important for padding calc later! + def __init__(self, source, alphabet=None, trim=False): """Iterate over Standard Flowgram Format (SFF) reads (as SeqRecord objects). @@ -996,30 +831,51 @@ def __init__(self, source, alphabet=None, trim=False): raise ValueError("The alphabet argument is no longer supported") super().__init__(source, mode="b", fmt="SFF") self.trim = trim - - def parse(self, handle): - """Start parsing the file, and return a SeqRecord generator.""" - try: - if 0 != handle.tell(): - raise ValueError("Not at start of file, offset %i" % handle.tell()) - except AttributeError: - # Probably a network handle or something like that - handle = _AddTellHandle(handle) - records = self.iterate(handle) - return records - - def iterate(self, handle): - """Parse the file and generate SeqRecord objects.""" - trim = self.trim + stream = self.stream ( - header_length, - index_offset, - index_length, - number_of_reads, + self._offset, + self.index_offset, + self.index_length, + self.number_of_reads, number_of_flows_per_read, - flow_chars, - key_sequence, - ) = _sff_file_header(handle) + self.flow_chars, + self.key_sequence, + ) = _sff_file_header(stream) + # Now on to the reads... + self.read_flow_fmt = ">%iH" % number_of_flows_per_read + self.read_flow_size = struct.calcsize(self.read_flow_fmt) + self._read_counter = 0 + + def parse(self, handle): + """To be removed.""" + return + + def __next__(self): + stream = self.stream + if self._read_counter == self.number_of_reads: + self._read_counter = None # check EOF only once + self._check_eof(stream) + raise StopIteration + elif self._read_counter is None: + raise StopIteration + # The spec allows for the index block to be before or even in the middle + # of the reads. We can check that if we keep track of our position + # in the file... + index_offset = self.index_offset + if self._offset == index_offset: + index_length = self.index_length + offset = index_offset + index_length + if offset % 8: + offset += 8 - (offset % 8) + assert offset % 8 == 0 + stream.seek(offset) + self._offset = offset + record = self._sff_read_seq_record(self.stream) + self._read_counter += 1 + return record + + def _sff_read_seq_record(self, stream): + """Parse the next read in the file, return data as a SeqRecord (PRIVATE).""" # Now on to the reads... # the read header format (fixed part): # read_header_length H @@ -1030,112 +886,228 @@ def iterate(self, handle): # clip_adapter_left H # clip_adapter_right H # [rest of read header depends on the name length etc] - read_header_fmt = ">2HI4H" - read_header_size = struct.calcsize(read_header_fmt) - read_flow_fmt = ">%iH" % number_of_flows_per_read - read_flow_size = struct.calcsize(read_flow_fmt) - assert 1 == struct.calcsize(">B") - assert 1 == struct.calcsize(">s") - assert 1 == struct.calcsize(">c") - assert read_header_size % 8 == 0 # Important for padding calc later! - # The spec allows for the index block to be before or even in the middle - # of the reads. We can check that if we keep track of our position - # in the file... - for read in range(number_of_reads): - if index_offset and handle.tell() == index_offset: - offset = index_offset + index_length - if offset % 8: - offset += 8 - (offset % 8) - assert offset % 8 == 0 - handle.seek(offset) - # Now that we've done this, we don't need to do it again. Clear - # the index_offset so we can skip extra handle.tell() calls: - index_offset = 0 - yield _sff_read_seq_record( - handle, number_of_flows_per_read, flow_chars, key_sequence, trim + read_header_fmt = SffIterator.read_header_fmt + read_header_size = SffIterator.read_header_size + ( + read_header_length, + name_length, + seq_len, + clip_qual_left, + clip_qual_right, + clip_adapter_left, + clip_adapter_right, + ) = struct.unpack(read_header_fmt, stream.read(read_header_size)) + if clip_qual_left: + clip_qual_left -= 1 # python counting + if clip_adapter_left: + clip_adapter_left -= 1 # python counting + if read_header_length < 10 or read_header_length % 8 != 0: + raise ValueError( + "Malformed read header, says length is %i" % read_header_length ) - _check_eof(handle, index_offset, index_length) + # now the name and any padding (remainder of header) + name = stream.read(name_length).decode() + padding = read_header_length - read_header_size - name_length + if stream.read(padding).count(_null) != padding: + import warnings + from Bio import BiopythonParserWarning -def _check_eof(handle, index_offset, index_length): - """Check final padding is OK (8 byte alignment) and file ends (PRIVATE). + warnings.warn( + "Your SFF file is invalid, post name %i " + "byte padding region contained data" % padding, + BiopythonParserWarning, + ) + self._offset += read_header_length + # now the flowgram values, flowgram index, bases and qualities + # NOTE - assuming flowgram_format==1, which means struct type H + flow_values = stream.read(self.read_flow_size) # unpack later if needed + temp_fmt = ">%iB" % seq_len # used for flow index and quals + flow_index = stream.read(seq_len) # unpack later if needed + seq = stream.read(seq_len) # Leave as bytes for Seq object + quals = list(struct.unpack(temp_fmt, stream.read(seq_len))) + self._offset += self.read_flow_size + seq_len * 3 + # now any padding... + padding = (self.read_flow_size + seq_len * 3) % 8 + if padding: + padding = 8 - padding + if stream.read(padding).count(_null) != padding: + import warnings - Will attempt to spot apparent SFF file concatenation and give an error. + from Bio import BiopythonParserWarning - Will not attempt to seek, only moves the handle forward. - """ - offset = handle.tell() - extra = b"" - padding = 0 + warnings.warn( + "Your SFF file is invalid, post quality %i " + "byte padding region contained data" % padding, + BiopythonParserWarning, + ) + self._offset += padding + # Follow Roche and apply most aggressive of qual and adapter clipping. + # Note Roche seems to ignore adapter clip fields when writing SFF, + # and uses just the quality clipping values for any clipping. + clip_left = max(clip_qual_left, clip_adapter_left) + # Right clipping of zero means no clipping + if clip_qual_right: + if clip_adapter_right: + clip_right = min(clip_qual_right, clip_adapter_right) + else: + # Typical case with Roche SFF files + clip_right = clip_qual_right + elif clip_adapter_right: + clip_right = clip_adapter_right + else: + clip_right = seq_len + # Now build a SeqRecord + if self.trim: + if clip_left >= clip_right: + # Raise an error? + import warnings - if index_offset and offset <= index_offset: - # Index block then end of file... - if offset < index_offset: - raise ValueError( - "Gap of %i bytes after final record end %i, " - "before %i where index starts?" - % (index_offset - offset, offset, index_offset) - ) - # Doing read to jump the index rather than a seek - # in case this is a network handle or similar - handle.read(index_offset + index_length - offset) - offset = index_offset + index_length - if offset != handle.tell(): + from Bio import BiopythonParserWarning + + warnings.warn( + "Overlapping clip values in SFF record, trimmed to nothing", + BiopythonParserWarning, + ) + seq = "" + quals = [] + else: + seq = seq[clip_left:clip_right].upper() + quals = quals[clip_left:clip_right] + # Don't record the clipping values, flow etc, they make no sense now: + annotations = {} + else: + if clip_left >= clip_right: + import warnings + + from Bio import BiopythonParserWarning + + warnings.warn( + "Overlapping clip values in SFF record", BiopythonParserWarning + ) + seq = seq.lower() + else: + # This use of mixed case mimics the Roche SFF tool's FASTA output + seq = ( + seq[:clip_left].lower() + + seq[clip_left:clip_right].upper() + + seq[clip_right:].lower() + ) + annotations = { + "flow_values": struct.unpack(self.read_flow_fmt, flow_values), + "flow_index": struct.unpack(temp_fmt, flow_index), + "flow_chars": self.flow_chars, + "flow_key": self.key_sequence, + "clip_qual_left": clip_qual_left, + "clip_qual_right": clip_qual_right, + "clip_adapter_left": clip_adapter_left, + "clip_adapter_right": clip_adapter_right, + } + if re.match(_valid_UAN_read_name, name): + annotations["time"] = _get_read_time(name) + annotations["region"] = _get_read_region(name) + annotations["coords"] = _get_read_xy(name) + annotations["molecule_type"] = "DNA" + record = SeqRecord( + Seq(seq), id=name, name=name, description="", annotations=annotations + ) + # Dirty trick to speed up this line: + # record.letter_annotations["phred_quality"] = quals + dict.__setitem__(record._per_letter_annotations, "phred_quality", quals) + # Return the record and then continue... + return record + + def _check_eof(self, stream): + """Check final padding is OK (8 byte alignment) and file ends (PRIVATE). + + Will attempt to spot apparent SFF file concatenation and give an error. + + Will not attempt to seek, only moves the stream forward. + """ + offset = self._offset + extra = b"" + padding = 0 + + index_offset = self.index_offset + index_length = self.index_length + if index_offset and offset <= index_offset: + # Index block then end of file... + if offset < index_offset: + raise ValueError( + "Gap of %i bytes after final record end %i, " + "before %i where index starts?" + % (index_offset - offset, offset, index_offset) + ) + # Doing read to jump the index rather than a seek + # in case this is a network handle or similar + stream.read(index_length) + self._offset += index_length + offset = index_offset + index_length + if offset != self._offset: + raise ValueError( + "Wanted %i, got %i, index is %i to %i" + % (offset, stream.tell(), index_offset, index_offset + index_length) + ) + + if offset % 8: + padding = 8 - (offset % 8) + extra = stream.read(padding) + self._offset += padding + + if padding >= 4 and extra[-4:] == _sff: + # Seen this in one user supplied file, should have been + # four bytes of null padding but was actually .sff and + # the start of a new concatenated SFF file! raise ValueError( - "Wanted %i, got %i, index is %i to %i" - % (offset, handle.tell(), index_offset, index_offset + index_length) + "Your SFF file is invalid, post index %i byte " + "null padding region ended '.sff' which could " + "be the start of a concatenated SFF file? " + "See offset %i" % (padding, offset) ) + if padding and not extra: + # TODO - Is this error harmless enough to just ignore? + import warnings - if offset % 8: - padding = 8 - (offset % 8) - extra = handle.read(padding) + from Bio import BiopythonParserWarning - if padding >= 4 and extra[-4:] == _sff: - # Seen this in one user supplied file, should have been - # four bytes of null padding but was actually .sff and - # the start of a new concatenated SFF file! - raise ValueError( - "Your SFF file is invalid, post index %i byte " - "null padding region ended '.sff' which could " - "be the start of a concatenated SFF file? " - "See offset %i" % (padding, offset) - ) - if padding and not extra: - # TODO - Is this error harmless enough to just ignore? - import warnings + warnings.warn( + "Your SFF file is technically invalid as it is missing " + "a terminal %i byte null padding region." % padding, + BiopythonParserWarning, + ) + return + if extra.count(_null) != padding: + import warnings - from Bio import BiopythonParserWarning + from Bio import BiopythonParserWarning - warnings.warn( - "Your SFF file is technically invalid as it is missing " - "a terminal %i byte null padding region." % padding, - BiopythonParserWarning, - ) - return - if extra.count(_null) != padding: - import warnings + warnings.warn( + "Your SFF file is invalid, post index %i byte " + "null padding region contained data: %r" % (padding, extra), + BiopythonParserWarning, + ) - from Bio import BiopythonParserWarning + offset = self._offset + if offset % 8 != 0: + raise ValueError( + "Wanted offset %i %% 8 = %i to be zero" % (offset, offset % 8) + ) + # Should now be at the end of the file... + extra = stream.read(4) + self._offset += 4 + if extra == _sff: + raise ValueError( + "Additional data at end of SFF file, " + "perhaps multiple SFF files concatenated? " + "See offset %i" % offset + ) + elif extra: + raise ValueError( + "Additional data at end of SFF file, see offset %i" % offset + ) - warnings.warn( - "Your SFF file is invalid, post index %i byte " - "null padding region contained data: %r" % (padding, extra), - BiopythonParserWarning, - ) - offset = handle.tell() - if offset % 8 != 0: - raise ValueError("Wanted offset %i %% 8 = %i to be zero" % (offset, offset % 8)) - # Should now be at the end of the file... - extra = handle.read(4) - if extra == _sff: - raise ValueError( - "Additional data at end of SFF file, " - "perhaps multiple SFF files concatenated? " - "See offset %i" % offset - ) - elif extra: - raise ValueError("Additional data at end of SFF file, see offset %i" % offset) +_powers_of_36 = [36**i for i in range(6)] class _SffTrimIterator(SffIterator): diff --git a/Bio/SeqIO/_index.py b/Bio/SeqIO/_index.py index 69165affdb4..052151752ef 100644 --- a/Bio/SeqIO/_index.py +++ b/Bio/SeqIO/_index.py @@ -27,6 +27,7 @@ """ import re +import struct from io import BytesIO from io import StringIO @@ -67,13 +68,16 @@ def __init__(self, filename, format): SeqFileRandomAccess.__init__(self, filename, format) ( header_length, - index_offset, - index_length, + self.index_offset, + self.index_length, number_of_reads, - self._flows_per_read, - self._flow_chars, - self._key_sequence, + self.number_of_flows_per_read, + self.flow_chars, + self.key_sequence, ) = SeqIO.SffIO._sff_file_header(self._handle) + # Now on to the reads... + self.read_flow_fmt = ">%iH" % self.number_of_flows_per_read + self.read_flow_size = struct.calcsize(self.read_flow_fmt) def __iter__(self): """Load any index block in the file, or build it the slow way (PRIVATE).""" @@ -85,9 +89,9 @@ def __iter__(self): index_offset, index_length, number_of_reads, - self._flows_per_read, - self._flow_chars, - self._key_sequence, + self.number_of_flows_per_read, + self.flow_chars, + self.key_sequence, ) = SeqIO.SffIO._sff_file_header(handle) if index_offset and index_length: # There is an index provided, try this the fast way: @@ -120,9 +124,12 @@ def __iter__(self): # Can have an index at start (or mid-file) handle.seek(max_offset) # Parse the final read, - SeqIO.SffIO._sff_read_raw_record(handle, self._flows_per_read) + SeqIO.SffIO._sff_read_raw_record( + handle, self.number_of_flows_per_read + ) # Should now be at the end of the file! - SeqIO.SffIO._check_eof(handle, index_offset, index_length) + self._offset = handle.tell() + SeqIO.SffIO.SffIterator._check_eof(self, handle) return # We used to give a warning in this case, but Ion Torrent's # SFF files don't have an index so that would be annoying. @@ -135,21 +142,23 @@ def __iter__(self): raise ValueError( "Indexed %i records, expected %i" % (count, number_of_reads) ) - SeqIO.SffIO._check_eof(handle, index_offset, index_length) + self._offset = handle.tell() + SeqIO.SffIO.SffIterator._check_eof(self, handle) def get(self, offset): """Return the SeqRecord starting at the given offset.""" handle = self._handle handle.seek(offset) - return SeqIO.SffIO._sff_read_seq_record( - handle, self._flows_per_read, self._flow_chars, self._key_sequence - ) + self._offset = offset + self.trim = False + return SeqIO.SffIO.SffIterator._sff_read_seq_record(self, handle) def get_raw(self, offset): """Return the raw record from the file as a bytes string.""" handle = self._handle handle.seek(offset) - return SeqIO.SffIO._sff_read_raw_record(handle, self._flows_per_read) + self.stream = handle + return SeqIO.SffIO._sff_read_raw_record(handle, self.number_of_flows_per_read) class SffTrimedRandomAccess(SffRandomAccess): @@ -159,13 +168,9 @@ def get(self, offset): """Return the SeqRecord starting at the given offset.""" handle = self._handle handle.seek(offset) - return SeqIO.SffIO._sff_read_seq_record( - handle, - self._flows_per_read, - self._flow_chars, - self._key_sequence, - trim=True, - ) + self._offset = offset + self.trim = True + return SeqIO.SffIO.SffIterator._sff_read_seq_record(self, handle) ################### diff --git a/Tests/test_SffIO.py b/Tests/test_SeqIO_SffIO.py similarity index 100% rename from Tests/test_SffIO.py rename to Tests/test_SeqIO_SffIO.py diff --git a/Tests/test_SeqIO_index.py b/Tests/test_SeqIO_index.py index 591ea849637..512beaed077 100644 --- a/Tests/test_SeqIO_index.py +++ b/Tests/test_SeqIO_index.py @@ -635,20 +635,16 @@ def get_raw_check(self, filename, fmt, comp): else: raise RuntimeError(f"Unexpected mode {mode}") if fmt == "sff": - rec2 = SeqIO.SffIO._sff_read_seq_record( - handle, - rec_dict._proxy._flows_per_read, - rec_dict._proxy._flow_chars, - rec_dict._proxy._key_sequence, - trim=False, + rec_dict._proxy.trim = False + rec_dict._proxy._offset = handle.tell() + rec2 = SeqIO.SffIO.SffIterator._sff_read_seq_record( + rec_dict._proxy, handle ) elif fmt == "sff-trim": - rec2 = SeqIO.SffIO._sff_read_seq_record( - handle, - rec_dict._proxy._flows_per_read, - rec_dict._proxy._flow_chars, - rec_dict._proxy._key_sequence, - trim=True, + rec_dict._proxy.trim = True + rec_dict._proxy._offset = handle.tell() + rec2 = SeqIO.SffIO.SffIterator._sff_read_seq_record( + rec_dict._proxy, handle ) elif fmt == "uniprot-xml": self.assertTrue(raw.startswith(b"