Don't allow comments at the beginning of Fasta files (biopython#4780)

* update * rename fasta file * update * warning * add FastaBlastIterator, FastaPearsonIterator * revert fmt * update docs * spelling * more spelling --------- Co-authored-by: Michiel de Hoon <[email protected]>
gouttegd · Sep 5, 2024 · 378c4c1 · 378c4c1
1 parent 69b466f
commit 378c4c1
Show file tree

Hide file tree

Showing 10 changed files with 333 additions and 26 deletions.
diff --git a/Bio/SeqIO/FastaIO.py b/Bio/SeqIO/FastaIO.py
@@ -15,13 +15,17 @@
 
 from Bio.Seq import Seq
 from Bio.SeqRecord import SeqRecord
+from Bio import BiopythonDeprecationWarning
+
 
 from .Interfaces import _clean
 from .Interfaces import _get_seq_string
 from .Interfaces import _TextIOSource
 from .Interfaces import SequenceIterator
 from .Interfaces import SequenceWriter
 
+import warnings
+
 
 def SimpleFastaParser(handle):
     """Iterate over Fasta records as string tuples.
@@ -45,14 +49,44 @@ def SimpleFastaParser(handle):
     ('delta', 'CGCGC')
 
     """
-    # Skip any text before the first record (e.g. blank lines, comments)
-    for line in handle:
-        if line[0] == ">":
-            title = line[1:].rstrip()
-            break
-    else:
-        # no break encountered - probably an empty file
+    try:
+        line = next(handle)
+    except StopIteration:
         return
+    if not line.startswith(">"):
+        warnings.warn(
+            "Previously, the FASTA parser silently ignored comments at the "
+            "beginning of the FASTA file (before the first sequence).\n"
+            "\n"
+            "Nowadays, the FASTA file format is usually understood not to "
+            "have any such comments, and most software packages do not allow "
+            "them. Therefore, the use of comments at the beginning of a FASTA "
+            "file is now deprecated in Biopython.\n"
+            "\n"
+            "In a future Biopython release, this deprecation warning will be "
+            "replaced by a ValueError. To avoid this, there are three "
+            "options:\n"
+            "\n"
+            "(1) Modify your FASTA file to remove such comments at the "
+            "beginning of the file.\n"
+            "\n"
+            "(2) Use SeqIO.parse with the 'fasta-pearson' format instead of "
+            "'fasta'. This format is consistent with the FASTA format defined "
+            "by William Pearson's FASTA aligner software. Thie format allows "
+            "for comments before the first sequence; lines starting with the "
+            "';' character anywhere in the file are also regarded as comment "
+            "lines and are ignored.\n"
+            "\n"
+            "(3) Use the 'fasta-blast' format. This format regards any lines "
+            "starting with '!', '#', or ';' as comment lines. The "
+            "'fasta-blast' format may be safer than the 'fasta-pearson' "
+            "format, as it explicitly indicates which lines are comments. ",
+            BiopythonDeprecationWarning,
+        )
+        for line in handle:
+            if line.startswith(">"):
+                break
+    title = line[1:].rstrip()
 
     # Main logic
     # Note, remove trailing whitespace, and any internal spaces
@@ -137,7 +171,7 @@ def FastaTwoLineParser(handle):
 
 
 class FastaIterator(SequenceIterator):
-    """Parser for Fasta files."""
+    """Parser for plain Fasta files without comments."""
 
     def __init__(
         self,
@@ -150,6 +184,9 @@ def __init__(
          - source - input stream opened in text mode, or a path to a file
          - alphabet - optional alphabet, not used. Leave as None.
 
+        This parser expects a plain Fasta format without comments or header
+        lines.
+
         By default this will act like calling Bio.SeqIO.parse(handle, "fasta")
         with no custom handling of the title lines:
 
@@ -241,6 +278,209 @@ def iterate(self, handle):
             )
 
 
+class FastaBlastIterator(SequenceIterator):
+    """Parser for Fasta files, allowing for comments as in BLAST."""
+
+    def __init__(
+        self,
+        source: _TextIOSource,
+        alphabet: None = None,
+    ) -> None:
+        """Iterate over Fasta records as SeqRecord objects.
+
+        Arguments:
+         - source - input stream opened in text mode, or a path to a file
+         - alphabet - optional alphabet, not used. Leave as None.
+
+        This parser expects the data to be in FASTA format. As in BLAST, lines
+        starting with '#', '!', or ';' are interpreted as comments and ignored.
+
+        This iterator acts like calling Bio.SeqIO.parse(handle, "fasta-blast")
+        with no custom handling of the title lines:
+
+        >>> with open("Fasta/dups.fasta") as handle:
+        ...     for record in FastaIterator(handle):
+        ...         print(record.id)
+        ...
+        alpha
+        beta
+        gamma
+        alpha
+        delta
+
+        If you want to modify the records before writing, for example to change
+        the ID of each record, you can use a generator function as follows:
+
+        >>> def modify_records(records):
+        ...     for record in records:
+        ...         record.id = record.id.upper()
+        ...         yield record
+        ...
+        >>> with open('Fasta/dups.fasta') as handle:
+        ...     for record in modify_records(FastaIterator(handle)):
+        ...         print(record.id)
+        ...
+        ALPHA
+        BETA
+        GAMMA
+        ALPHA
+        DELTA
+
+        """
+        if alphabet is not None:
+            raise ValueError("The alphabet argument is no longer supported")
+        super().__init__(source, mode="t", fmt="FASTA")
+
+    def parse(self, handle):
+        """Start parsing the file, and return a SeqRecord generator."""
+        records = self.iterate(handle)
+        return records
+
+    def iterate(self, handle):
+        """Parse the file and generate SeqRecord objects."""
+        for line in handle:
+            if line[0] not in "#!;":
+                break
+        else:
+            return
+        if not line.startswith(">"):
+            raise ValueError(
+                "Expected FASTA record starting with '>' character.\n"
+                "If this line is a comment, please use '#', '!', or ';' as "
+                "the first character, or use the 'fasta-pearson' format for "
+                "parsing.\n"
+                f"Got: '{line}'"
+            )
+        title = line[1:].rstrip()
+        lines = []
+        for line in handle:
+            # Main logic
+            # Note, remove trailing whitespace, and any internal spaces
+            # (and any embedded \r which are possible in mangled files
+            # when not opened in universal read lines mode)
+            if line[0] in "#!;":
+                pass
+            elif line[0] == ">":
+                try:
+                    first_word = title.split(None, 1)[0]
+                except IndexError:
+                    first_word = ""
+                sequence = "".join(lines).replace(" ", "").replace("\r", "")
+                yield SeqRecord(
+                    Seq(sequence), id=first_word, name=first_word, description=title
+                )
+                lines = []
+                title = line[1:].rstrip()
+            else:
+                lines.append(line.rstrip())
+        try:
+            first_word = title.split(None, 1)[0]
+        except IndexError:
+            first_word = ""
+        sequence = "".join(lines).replace(" ", "").replace("\r", "")
+        yield SeqRecord(
+            Seq(sequence), id=first_word, name=first_word, description=title
+        )
+
+
+class FastaPearsonIterator(SequenceIterator):
+    """Parser for Fasta files, allowing for comments as in the FASTA aligner."""
+
+    def __init__(
+        self,
+        source: _TextIOSource,
+        alphabet: None = None,
+    ) -> None:
+        """Iterate over Fasta records as SeqRecord objects.
+
+        Arguments:
+         - source - input stream opened in text mode, or a path to a file
+         - alphabet - optional alphabet, not used. Leave as None.
+
+        This parser expects a Fasta format allowing for a header (before the
+        first sequence record) and comments (lines starting with ';') as in
+        William Pearson's FASTA aligner software.
+
+        This iterator acts as calling Bio.SeqIO.parse(handle, "fasta-pearson")
+        with no custom handling of the title lines:
+
+        >>> with open("Fasta/dups.fasta") as handle:
+        ...     for record in FastaIterator(handle):
+        ...         print(record.id)
+        ...
+        alpha
+        beta
+        gamma
+        alpha
+        delta
+
+        If you want to modify the records before writing, for example to change
+        the ID of each record, you can use a generator function as follows:
+
+        >>> def modify_records(records):
+        ...     for record in records:
+        ...         record.id = record.id.upper()
+        ...         yield record
+        ...
+        >>> with open('Fasta/dups.fasta') as handle:
+        ...     for record in modify_records(FastaIterator(handle)):
+        ...         print(record.id)
+        ...
+        ALPHA
+        BETA
+        GAMMA
+        ALPHA
+        DELTA
+
+        """
+        if alphabet is not None:
+            raise ValueError("The alphabet argument is no longer supported")
+        super().__init__(source, mode="t", fmt="Fasta")
+
+    def parse(self, handle):
+        """Start parsing the file, and return a SeqRecord generator."""
+        records = self.iterate(handle)
+        return records
+
+    def iterate(self, handle):
+        """Parse the file and generate SeqRecord objects."""
+        for line in handle:
+            if line.startswith(">"):
+                break
+        else:
+            return
+        title = line[1:].rstrip()
+        lines = []
+        for line in handle:
+            # Main logic
+            # Note, remove trailing whitespace, and any internal spaces
+            # (and any embedded \r which are possible in mangled files
+            # when not opened in universal read lines mode)
+            if line[0] == ";":
+                pass
+            elif line[0] == ">":
+                try:
+                    first_word = title.split(None, 1)[0]
+                except IndexError:
+                    first_word = ""
+                sequence = "".join(lines).replace(" ", "").replace("\r", "")
+                yield SeqRecord(
+                    Seq(sequence), id=first_word, name=first_word, description=title
+                )
+                lines = []
+                title = line[1:].rstrip()
+            else:
+                lines.append(line.rstrip())
+        try:
+            first_word = title.split(None, 1)[0]
+        except IndexError:
+            first_word = ""
+        sequence = "".join(lines).replace(" ", "").replace("\r", "")
+        yield SeqRecord(
+            Seq(sequence), id=first_word, name=first_word, description=title
+        )
+
+
 class FastaWriter(SequenceWriter):
     """Class to write Fasta format files (OBSOLETE).
 

diff --git a/Bio/SeqIO/__init__.py b/Bio/SeqIO/__init__.py
@@ -420,6 +420,8 @@
     "ace": AceIO.AceIterator,
     "fasta": FastaIO.FastaIterator,
     "fasta-2line": FastaIO.FastaTwoLineIterator,
+    "fasta-blast": FastaIO.FastaBlastIterator,
+    "fasta-pearson": FastaIO.FastaPearsonIterator,
     "ig": IgIO.IgIterator,
     "embl": InsdcIO.EmblIterator,
     "embl-cds": InsdcIO.EmblCdsFeatureIterator,

diff --git a/DEPRECATED.rst b/DEPRECATED.rst
@@ -60,6 +60,22 @@ was deprecated as of Release 1.70.
 Biopython modules, methods, functions
 =====================================
 
+Bio.SeqIO.FastaIO
+-----------------
+The ``SimpleFastaParser`` (which is used by Bio.SeqIO.parse if
+``format='fasta'`` or ``format='fasta-2line'``) interprets lines before the
+first line starting with '>' as comments and skips them. To be consistent with
+the most common interpretation of the FASTA file format, the use of such
+comment lines at the beginning of the FASTA file was deprecated in Biopython
+Release 1.85.
+As an alternative, you can use ``format='fasta-pearson'`` to specify the FASTA
+file format as defined by William Pearson's FASTA aligner program, allowing for
+comment lines at the top of the FASTA file (lines anywhere in the file starting
+by ';' are also regarded as comment lines and skipped).
+Another option is to use ``format='fasta-blast'``; this follows the FASTA file
+format accepted by BLAST, treating any lines starting with '#', ';', or '!' as
+comment lines and ignoring them.
+
 Bio.Entrez
 ----------
 The ``egquery`` function wrapping the NCBI EGQuery (Entrez Global Query)

diff --git a/Tests/Fasta/README b/Tests/Fasta/README
@@ -10,11 +10,11 @@ tools.  These are for tested in Bio.SeqIO and Bio.AlignIO
 (where the format is called "fasta") as well as other older
 parts of Biopython such as the Bio.Fasta module.
 
-ID     Description
-f001   1 protein sequence
-f002   3 DNA sequences
-f003   2 proteins, with comments
-fa01   fasta alignment
+ID        Description
+f001      1 protein sequence
+f002      3 DNA sequences
+f003.fa   2 proteins, with comments
+fa01      fasta alignment
 
 The following are example "machine readable" pairwise alignment
 output files from the FASTA tools when using the -m 10 command

diff --git a/Tests/Fasta/aster_blast.pro b/Tests/Fasta/aster_blast.pro
@@ -0,0 +1,7 @@
+# Some header
+>gi|3298468|dbj|BAA31520.1| SAMIPF
+#Some comment here
+GGHVNPAVTFGAFVGGNITLLRGIVYIIAQLLGSTVACLLLKFVTNDMAVGVFSLSAGVGVTNALVFEIV
+;Another comment here
+MTFGLVYTVYATAIDPKKGSLGTIAPIAIGFIVGANI
+!One more comment
diff --git a/Tests/Fasta/aster_pearson.pro b/Tests/Fasta/aster_pearson.pro
@@ -0,0 +1,7 @@
+Some header
+>gi|3298468|dbj|BAA31520.1| SAMIPF
+;Some comment here
+GGHVNPAVTFGAFVGGNITLLRGIVYIIAQLLGSTVACLLLKFVTNDMAVGVFSLSAGVGVTNALVFEIV
+;Another comment here
+MTFGLVYTVYATAIDPKKGSLGTIAPIAIGFIVGANI
+;One more comment
diff --git a/Tests/Fasta/f003 b/Tests/Fasta/f003
diff --git a/Tests/Fasta/f003.fa b/Tests/Fasta/f003.fa
@@ -0,0 +1,5 @@
+>gi|3318709|pdb|1A91|
+MENLNMDLLYMAAAVMMGLAAIGAAIGIGILGGKFLEGAARQPDLIPLLRTQFFIVMGLVDAIPMIAVGL
+GLYVMFAVA
+>gi|whatever|whatever
+MENLNMDLLYMAAAVMMGLAAIGAAIGIGILGG