From 378c4c124ea28f9cbdaee537cf52b803332fc6e9 Mon Sep 17 00:00:00 2001 From: mdehoon Date: Fri, 6 Sep 2024 02:44:40 +0900 Subject: [PATCH] Don't allow comments at the beginning of Fasta files (#4780) * update * rename fasta file * update * warning * add FastaBlastIterator, FastaPearsonIterator * revert fmt * update docs * spelling * more spelling --------- Co-authored-by: Michiel de Hoon --- Bio/SeqIO/FastaIO.py | 256 ++++++++++++++++++++++++++++++++-- Bio/SeqIO/__init__.py | 2 + DEPRECATED.rst | 16 +++ Tests/Fasta/README | 10 +- Tests/Fasta/aster_blast.pro | 7 + Tests/Fasta/aster_pearson.pro | 7 + Tests/Fasta/f003 | 10 -- Tests/Fasta/f003.fa | 5 + Tests/test_SeqIO_FastaIO.py | 42 +++++- Tests/test_Seq_objs.py | 4 +- 10 files changed, 333 insertions(+), 26 deletions(-) create mode 100644 Tests/Fasta/aster_blast.pro create mode 100644 Tests/Fasta/aster_pearson.pro delete mode 100644 Tests/Fasta/f003 create mode 100644 Tests/Fasta/f003.fa diff --git a/Bio/SeqIO/FastaIO.py b/Bio/SeqIO/FastaIO.py index 3bb04cf3a87..b5da78277d9 100644 --- a/Bio/SeqIO/FastaIO.py +++ b/Bio/SeqIO/FastaIO.py @@ -15,6 +15,8 @@ from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord +from Bio import BiopythonDeprecationWarning + from .Interfaces import _clean from .Interfaces import _get_seq_string @@ -22,6 +24,8 @@ from .Interfaces import SequenceIterator from .Interfaces import SequenceWriter +import warnings + def SimpleFastaParser(handle): """Iterate over Fasta records as string tuples. @@ -45,14 +49,44 @@ def SimpleFastaParser(handle): ('delta', 'CGCGC') """ - # Skip any text before the first record (e.g. blank lines, comments) - for line in handle: - if line[0] == ">": - title = line[1:].rstrip() - break - else: - # no break encountered - probably an empty file + try: + line = next(handle) + except StopIteration: return + if not line.startswith(">"): + warnings.warn( + "Previously, the FASTA parser silently ignored comments at the " + "beginning of the FASTA file (before the first sequence).\n" + "\n" + "Nowadays, the FASTA file format is usually understood not to " + "have any such comments, and most software packages do not allow " + "them. Therefore, the use of comments at the beginning of a FASTA " + "file is now deprecated in Biopython.\n" + "\n" + "In a future Biopython release, this deprecation warning will be " + "replaced by a ValueError. To avoid this, there are three " + "options:\n" + "\n" + "(1) Modify your FASTA file to remove such comments at the " + "beginning of the file.\n" + "\n" + "(2) Use SeqIO.parse with the 'fasta-pearson' format instead of " + "'fasta'. This format is consistent with the FASTA format defined " + "by William Pearson's FASTA aligner software. Thie format allows " + "for comments before the first sequence; lines starting with the " + "';' character anywhere in the file are also regarded as comment " + "lines and are ignored.\n" + "\n" + "(3) Use the 'fasta-blast' format. This format regards any lines " + "starting with '!', '#', or ';' as comment lines. The " + "'fasta-blast' format may be safer than the 'fasta-pearson' " + "format, as it explicitly indicates which lines are comments. ", + BiopythonDeprecationWarning, + ) + for line in handle: + if line.startswith(">"): + break + title = line[1:].rstrip() # Main logic # Note, remove trailing whitespace, and any internal spaces @@ -137,7 +171,7 @@ def FastaTwoLineParser(handle): class FastaIterator(SequenceIterator): - """Parser for Fasta files.""" + """Parser for plain Fasta files without comments.""" def __init__( self, @@ -150,6 +184,9 @@ def __init__( - source - input stream opened in text mode, or a path to a file - alphabet - optional alphabet, not used. Leave as None. + This parser expects a plain Fasta format without comments or header + lines. + By default this will act like calling Bio.SeqIO.parse(handle, "fasta") with no custom handling of the title lines: @@ -241,6 +278,209 @@ def iterate(self, handle): ) +class FastaBlastIterator(SequenceIterator): + """Parser for Fasta files, allowing for comments as in BLAST.""" + + def __init__( + self, + source: _TextIOSource, + alphabet: None = None, + ) -> None: + """Iterate over Fasta records as SeqRecord objects. + + Arguments: + - source - input stream opened in text mode, or a path to a file + - alphabet - optional alphabet, not used. Leave as None. + + This parser expects the data to be in FASTA format. As in BLAST, lines + starting with '#', '!', or ';' are interpreted as comments and ignored. + + This iterator acts like calling Bio.SeqIO.parse(handle, "fasta-blast") + with no custom handling of the title lines: + + >>> with open("Fasta/dups.fasta") as handle: + ... for record in FastaIterator(handle): + ... print(record.id) + ... + alpha + beta + gamma + alpha + delta + + If you want to modify the records before writing, for example to change + the ID of each record, you can use a generator function as follows: + + >>> def modify_records(records): + ... for record in records: + ... record.id = record.id.upper() + ... yield record + ... + >>> with open('Fasta/dups.fasta') as handle: + ... for record in modify_records(FastaIterator(handle)): + ... print(record.id) + ... + ALPHA + BETA + GAMMA + ALPHA + DELTA + + """ + if alphabet is not None: + raise ValueError("The alphabet argument is no longer supported") + super().__init__(source, mode="t", fmt="FASTA") + + def parse(self, handle): + """Start parsing the file, and return a SeqRecord generator.""" + records = self.iterate(handle) + return records + + def iterate(self, handle): + """Parse the file and generate SeqRecord objects.""" + for line in handle: + if line[0] not in "#!;": + break + else: + return + if not line.startswith(">"): + raise ValueError( + "Expected FASTA record starting with '>' character.\n" + "If this line is a comment, please use '#', '!', or ';' as " + "the first character, or use the 'fasta-pearson' format for " + "parsing.\n" + f"Got: '{line}'" + ) + title = line[1:].rstrip() + lines = [] + for line in handle: + # Main logic + # Note, remove trailing whitespace, and any internal spaces + # (and any embedded \r which are possible in mangled files + # when not opened in universal read lines mode) + if line[0] in "#!;": + pass + elif line[0] == ">": + try: + first_word = title.split(None, 1)[0] + except IndexError: + first_word = "" + sequence = "".join(lines).replace(" ", "").replace("\r", "") + yield SeqRecord( + Seq(sequence), id=first_word, name=first_word, description=title + ) + lines = [] + title = line[1:].rstrip() + else: + lines.append(line.rstrip()) + try: + first_word = title.split(None, 1)[0] + except IndexError: + first_word = "" + sequence = "".join(lines).replace(" ", "").replace("\r", "") + yield SeqRecord( + Seq(sequence), id=first_word, name=first_word, description=title + ) + + +class FastaPearsonIterator(SequenceIterator): + """Parser for Fasta files, allowing for comments as in the FASTA aligner.""" + + def __init__( + self, + source: _TextIOSource, + alphabet: None = None, + ) -> None: + """Iterate over Fasta records as SeqRecord objects. + + Arguments: + - source - input stream opened in text mode, or a path to a file + - alphabet - optional alphabet, not used. Leave as None. + + This parser expects a Fasta format allowing for a header (before the + first sequence record) and comments (lines starting with ';') as in + William Pearson's FASTA aligner software. + + This iterator acts as calling Bio.SeqIO.parse(handle, "fasta-pearson") + with no custom handling of the title lines: + + >>> with open("Fasta/dups.fasta") as handle: + ... for record in FastaIterator(handle): + ... print(record.id) + ... + alpha + beta + gamma + alpha + delta + + If you want to modify the records before writing, for example to change + the ID of each record, you can use a generator function as follows: + + >>> def modify_records(records): + ... for record in records: + ... record.id = record.id.upper() + ... yield record + ... + >>> with open('Fasta/dups.fasta') as handle: + ... for record in modify_records(FastaIterator(handle)): + ... print(record.id) + ... + ALPHA + BETA + GAMMA + ALPHA + DELTA + + """ + if alphabet is not None: + raise ValueError("The alphabet argument is no longer supported") + super().__init__(source, mode="t", fmt="Fasta") + + def parse(self, handle): + """Start parsing the file, and return a SeqRecord generator.""" + records = self.iterate(handle) + return records + + def iterate(self, handle): + """Parse the file and generate SeqRecord objects.""" + for line in handle: + if line.startswith(">"): + break + else: + return + title = line[1:].rstrip() + lines = [] + for line in handle: + # Main logic + # Note, remove trailing whitespace, and any internal spaces + # (and any embedded \r which are possible in mangled files + # when not opened in universal read lines mode) + if line[0] == ";": + pass + elif line[0] == ">": + try: + first_word = title.split(None, 1)[0] + except IndexError: + first_word = "" + sequence = "".join(lines).replace(" ", "").replace("\r", "") + yield SeqRecord( + Seq(sequence), id=first_word, name=first_word, description=title + ) + lines = [] + title = line[1:].rstrip() + else: + lines.append(line.rstrip()) + try: + first_word = title.split(None, 1)[0] + except IndexError: + first_word = "" + sequence = "".join(lines).replace(" ", "").replace("\r", "") + yield SeqRecord( + Seq(sequence), id=first_word, name=first_word, description=title + ) + + class FastaWriter(SequenceWriter): """Class to write Fasta format files (OBSOLETE). diff --git a/Bio/SeqIO/__init__.py b/Bio/SeqIO/__init__.py index 08e7dc2917e..a4de9cc8342 100644 --- a/Bio/SeqIO/__init__.py +++ b/Bio/SeqIO/__init__.py @@ -420,6 +420,8 @@ "ace": AceIO.AceIterator, "fasta": FastaIO.FastaIterator, "fasta-2line": FastaIO.FastaTwoLineIterator, + "fasta-blast": FastaIO.FastaBlastIterator, + "fasta-pearson": FastaIO.FastaPearsonIterator, "ig": IgIO.IgIterator, "embl": InsdcIO.EmblIterator, "embl-cds": InsdcIO.EmblCdsFeatureIterator, diff --git a/DEPRECATED.rst b/DEPRECATED.rst index 460e802ac77..78989cf6c60 100644 --- a/DEPRECATED.rst +++ b/DEPRECATED.rst @@ -60,6 +60,22 @@ was deprecated as of Release 1.70. Biopython modules, methods, functions ===================================== +Bio.SeqIO.FastaIO +----------------- +The ``SimpleFastaParser`` (which is used by Bio.SeqIO.parse if +``format='fasta'`` or ``format='fasta-2line'``) interprets lines before the +first line starting with '>' as comments and skips them. To be consistent with +the most common interpretation of the FASTA file format, the use of such +comment lines at the beginning of the FASTA file was deprecated in Biopython +Release 1.85. +As an alternative, you can use ``format='fasta-pearson'`` to specify the FASTA +file format as defined by William Pearson's FASTA aligner program, allowing for +comment lines at the top of the FASTA file (lines anywhere in the file starting +by ';' are also regarded as comment lines and skipped). +Another option is to use ``format='fasta-blast'``; this follows the FASTA file +format accepted by BLAST, treating any lines starting with '#', ';', or '!' as +comment lines and ignoring them. + Bio.Entrez ---------- The ``egquery`` function wrapping the NCBI EGQuery (Entrez Global Query) diff --git a/Tests/Fasta/README b/Tests/Fasta/README index e8512fcc44a..a46ca466b3b 100644 --- a/Tests/Fasta/README +++ b/Tests/Fasta/README @@ -10,11 +10,11 @@ tools. These are for tested in Bio.SeqIO and Bio.AlignIO (where the format is called "fasta") as well as other older parts of Biopython such as the Bio.Fasta module. -ID Description -f001 1 protein sequence -f002 3 DNA sequences -f003 2 proteins, with comments -fa01 fasta alignment +ID Description +f001 1 protein sequence +f002 3 DNA sequences +f003.fa 2 proteins, with comments +fa01 fasta alignment The following are example "machine readable" pairwise alignment output files from the FASTA tools when using the -m 10 command diff --git a/Tests/Fasta/aster_blast.pro b/Tests/Fasta/aster_blast.pro new file mode 100644 index 00000000000..241d1567953 --- /dev/null +++ b/Tests/Fasta/aster_blast.pro @@ -0,0 +1,7 @@ +# Some header +>gi|3298468|dbj|BAA31520.1| SAMIPF +#Some comment here +GGHVNPAVTFGAFVGGNITLLRGIVYIIAQLLGSTVACLLLKFVTNDMAVGVFSLSAGVGVTNALVFEIV +;Another comment here +MTFGLVYTVYATAIDPKKGSLGTIAPIAIGFIVGANI +!One more comment diff --git a/Tests/Fasta/aster_pearson.pro b/Tests/Fasta/aster_pearson.pro new file mode 100644 index 00000000000..7fed5508d5c --- /dev/null +++ b/Tests/Fasta/aster_pearson.pro @@ -0,0 +1,7 @@ +Some header +>gi|3298468|dbj|BAA31520.1| SAMIPF +;Some comment here +GGHVNPAVTFGAFVGGNITLLRGIVYIIAQLLGSTVACLLLKFVTNDMAVGVFSLSAGVGVTNALVFEIV +;Another comment here +MTFGLVYTVYATAIDPKKGSLGTIAPIAIGFIVGANI +;One more comment diff --git a/Tests/Fasta/f003 b/Tests/Fasta/f003 deleted file mode 100644 index 5dd53dc820c..00000000000 --- a/Tests/Fasta/f003 +++ /dev/null @@ -1,10 +0,0 @@ -# This file has comments in it ->gi|3318709|pdb|1A91| -MENLNMDLLYMAAAVMMGLAAIGAAIGIGILGGKFLEGAARQPDLIPLLRTQFFIVMGLVDAIPMIAVGL -GLYVMFAVA -# I don't really know if FASTA format is supposed to support this -# but we might be able to without many problems ->gi|whatever|whatever -MENLNMDLLYMAAAVMMGLAAIGAAIGIGILGG -# More comments at the end of the file -# Man, this file has more comments than sequence diff --git a/Tests/Fasta/f003.fa b/Tests/Fasta/f003.fa new file mode 100644 index 00000000000..3d1a58da2a4 --- /dev/null +++ b/Tests/Fasta/f003.fa @@ -0,0 +1,5 @@ +>gi|3318709|pdb|1A91| +MENLNMDLLYMAAAVMMGLAAIGAAIGIGILGGKFLEGAARQPDLIPLLRTQFFIVMGLVDAIPMIAVGL +GLYVMFAVA +>gi|whatever|whatever +MENLNMDLLYMAAAVMMGLAAIGAAIGIGILGG diff --git a/Tests/test_SeqIO_FastaIO.py b/Tests/test_SeqIO_FastaIO.py index 094a5acfa8b..40c003a9e47 100644 --- a/Tests/test_SeqIO_FastaIO.py +++ b/Tests/test_SeqIO_FastaIO.py @@ -9,10 +9,11 @@ from io import StringIO from Bio import SeqIO -from Bio.SeqIO.FastaIO import FastaIterator from Bio.SeqIO.FastaIO import FastaTwoLineParser from Bio.SeqIO.FastaIO import SimpleFastaParser +from Bio import BiopythonDeprecationWarning + def title_to_ids(title): """Convert a FASTA title line into the id, name, and description. @@ -192,6 +193,45 @@ def test_exceptions_FastaTwoLineParser(self): list(FastaTwoLineParser(handle)) +class TestFastaWithComments(unittest.TestCase): + """Test FastaBlastIterator and FastaPearsonIterator.""" + + expected = SeqIO.read("Fasta/aster.pro", "fasta") + + def test_fasta_blast(self): + """Test FastaBlastIterator.""" + + expected = self.expected + + record = SeqIO.read("Fasta/aster_blast.pro", "fasta-blast") + self.assertEqual(expected.id, record.id) + self.assertEqual(expected.name, record.name) + self.assertEqual(expected.description, record.description) + self.assertEqual(expected.seq, record.seq) + + def test_fasta_pearson(self): + """Test FastaPearsonIterator.""" + + expected = self.expected + + record = SeqIO.read("Fasta/aster_pearson.pro", "fasta-pearson") + self.assertEqual(expected.id, record.id) + self.assertEqual(expected.name, record.name) + self.assertEqual(expected.description, record.description) + self.assertEqual(expected.seq, record.seq) + + def test_valueerrors(self): + """Test if ValueErrors are raised if comments are found unexpectedly.""" + + self.assertRaises( + ValueError, SeqIO.read, "Fasta/aster_pearson.pro", "fasta-blast" + ) + with self.assertWarns(BiopythonDeprecationWarning): + record = SeqIO.read("Fasta/aster_pearson.pro", "fasta") + with self.assertWarns(BiopythonDeprecationWarning): + record = SeqIO.read("Fasta/aster_blast.pro", "fasta") + + if __name__ == "__main__": runner = unittest.TextTestRunner(verbosity=2) unittest.main(testRunner=runner) diff --git a/Tests/test_Seq_objs.py b/Tests/test_Seq_objs.py index 55300c63262..9aafed15909 100644 --- a/Tests/test_Seq_objs.py +++ b/Tests/test_Seq_objs.py @@ -965,7 +965,7 @@ def test_join_MutableSeq_mixed(self): def test_join_Seq_with_file(self): """Checks if Seq join correctly concatenates sequence from a file with the spacer.""" - filename = "Fasta/f003" + filename = "Fasta/f003.fa" seqlist = [record.seq for record in SeqIO.parse(filename, "fasta")] seqlist_as_strings = [str(_) for _ in seqlist] @@ -1005,7 +1005,7 @@ def test_join_MutableSeq(self): def test_join_MutableSeq_with_file(self): """Checks if MutableSeq join correctly concatenates sequence from a file with the spacer.""" - filename = "Fasta/f003" + filename = "Fasta/f003.fa" seqlist = [record.seq for record in SeqIO.parse(filename, "fasta")] seqlist_as_strings = [str(_) for _ in seqlist]