Skip to content

Commit

Permalink
Don't allow comments at the beginning of Fasta files (biopython#4780)
Browse files Browse the repository at this point in the history
* update

* rename fasta file

* update

* warning

* add FastaBlastIterator, FastaPearsonIterator

* revert fmt

* update docs

* spelling

* more spelling

---------

Co-authored-by: Michiel de Hoon <[email protected]>
  • Loading branch information
mdehoon and Michiel de Hoon authored Sep 5, 2024
1 parent 69b466f commit 378c4c1
Show file tree
Hide file tree
Showing 10 changed files with 333 additions and 26 deletions.
256 changes: 248 additions & 8 deletions Bio/SeqIO/FastaIO.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,17 @@

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import BiopythonDeprecationWarning


from .Interfaces import _clean
from .Interfaces import _get_seq_string
from .Interfaces import _TextIOSource
from .Interfaces import SequenceIterator
from .Interfaces import SequenceWriter

import warnings


def SimpleFastaParser(handle):
"""Iterate over Fasta records as string tuples.
Expand All @@ -45,14 +49,44 @@ def SimpleFastaParser(handle):
('delta', 'CGCGC')
"""
# Skip any text before the first record (e.g. blank lines, comments)
for line in handle:
if line[0] == ">":
title = line[1:].rstrip()
break
else:
# no break encountered - probably an empty file
try:
line = next(handle)
except StopIteration:
return
if not line.startswith(">"):
warnings.warn(
"Previously, the FASTA parser silently ignored comments at the "
"beginning of the FASTA file (before the first sequence).\n"
"\n"
"Nowadays, the FASTA file format is usually understood not to "
"have any such comments, and most software packages do not allow "
"them. Therefore, the use of comments at the beginning of a FASTA "
"file is now deprecated in Biopython.\n"
"\n"
"In a future Biopython release, this deprecation warning will be "
"replaced by a ValueError. To avoid this, there are three "
"options:\n"
"\n"
"(1) Modify your FASTA file to remove such comments at the "
"beginning of the file.\n"
"\n"
"(2) Use SeqIO.parse with the 'fasta-pearson' format instead of "
"'fasta'. This format is consistent with the FASTA format defined "
"by William Pearson's FASTA aligner software. Thie format allows "
"for comments before the first sequence; lines starting with the "
"';' character anywhere in the file are also regarded as comment "
"lines and are ignored.\n"
"\n"
"(3) Use the 'fasta-blast' format. This format regards any lines "
"starting with '!', '#', or ';' as comment lines. The "
"'fasta-blast' format may be safer than the 'fasta-pearson' "
"format, as it explicitly indicates which lines are comments. ",
BiopythonDeprecationWarning,
)
for line in handle:
if line.startswith(">"):
break
title = line[1:].rstrip()

# Main logic
# Note, remove trailing whitespace, and any internal spaces
Expand Down Expand Up @@ -137,7 +171,7 @@ def FastaTwoLineParser(handle):


class FastaIterator(SequenceIterator):
"""Parser for Fasta files."""
"""Parser for plain Fasta files without comments."""

def __init__(
self,
Expand All @@ -150,6 +184,9 @@ def __init__(
- source - input stream opened in text mode, or a path to a file
- alphabet - optional alphabet, not used. Leave as None.
This parser expects a plain Fasta format without comments or header
lines.
By default this will act like calling Bio.SeqIO.parse(handle, "fasta")
with no custom handling of the title lines:
Expand Down Expand Up @@ -241,6 +278,209 @@ def iterate(self, handle):
)


class FastaBlastIterator(SequenceIterator):
"""Parser for Fasta files, allowing for comments as in BLAST."""

def __init__(
self,
source: _TextIOSource,
alphabet: None = None,
) -> None:
"""Iterate over Fasta records as SeqRecord objects.
Arguments:
- source - input stream opened in text mode, or a path to a file
- alphabet - optional alphabet, not used. Leave as None.
This parser expects the data to be in FASTA format. As in BLAST, lines
starting with '#', '!', or ';' are interpreted as comments and ignored.
This iterator acts like calling Bio.SeqIO.parse(handle, "fasta-blast")
with no custom handling of the title lines:
>>> with open("Fasta/dups.fasta") as handle:
... for record in FastaIterator(handle):
... print(record.id)
...
alpha
beta
gamma
alpha
delta
If you want to modify the records before writing, for example to change
the ID of each record, you can use a generator function as follows:
>>> def modify_records(records):
... for record in records:
... record.id = record.id.upper()
... yield record
...
>>> with open('Fasta/dups.fasta') as handle:
... for record in modify_records(FastaIterator(handle)):
... print(record.id)
...
ALPHA
BETA
GAMMA
ALPHA
DELTA
"""
if alphabet is not None:
raise ValueError("The alphabet argument is no longer supported")
super().__init__(source, mode="t", fmt="FASTA")

def parse(self, handle):
"""Start parsing the file, and return a SeqRecord generator."""
records = self.iterate(handle)
return records

def iterate(self, handle):
"""Parse the file and generate SeqRecord objects."""
for line in handle:
if line[0] not in "#!;":
break
else:
return
if not line.startswith(">"):
raise ValueError(
"Expected FASTA record starting with '>' character.\n"
"If this line is a comment, please use '#', '!', or ';' as "
"the first character, or use the 'fasta-pearson' format for "
"parsing.\n"
f"Got: '{line}'"
)
title = line[1:].rstrip()
lines = []
for line in handle:
# Main logic
# Note, remove trailing whitespace, and any internal spaces
# (and any embedded \r which are possible in mangled files
# when not opened in universal read lines mode)
if line[0] in "#!;":
pass
elif line[0] == ">":
try:
first_word = title.split(None, 1)[0]
except IndexError:
first_word = ""
sequence = "".join(lines).replace(" ", "").replace("\r", "")
yield SeqRecord(
Seq(sequence), id=first_word, name=first_word, description=title
)
lines = []
title = line[1:].rstrip()
else:
lines.append(line.rstrip())
try:
first_word = title.split(None, 1)[0]
except IndexError:
first_word = ""
sequence = "".join(lines).replace(" ", "").replace("\r", "")
yield SeqRecord(
Seq(sequence), id=first_word, name=first_word, description=title
)


class FastaPearsonIterator(SequenceIterator):
"""Parser for Fasta files, allowing for comments as in the FASTA aligner."""

def __init__(
self,
source: _TextIOSource,
alphabet: None = None,
) -> None:
"""Iterate over Fasta records as SeqRecord objects.
Arguments:
- source - input stream opened in text mode, or a path to a file
- alphabet - optional alphabet, not used. Leave as None.
This parser expects a Fasta format allowing for a header (before the
first sequence record) and comments (lines starting with ';') as in
William Pearson's FASTA aligner software.
This iterator acts as calling Bio.SeqIO.parse(handle, "fasta-pearson")
with no custom handling of the title lines:
>>> with open("Fasta/dups.fasta") as handle:
... for record in FastaIterator(handle):
... print(record.id)
...
alpha
beta
gamma
alpha
delta
If you want to modify the records before writing, for example to change
the ID of each record, you can use a generator function as follows:
>>> def modify_records(records):
... for record in records:
... record.id = record.id.upper()
... yield record
...
>>> with open('Fasta/dups.fasta') as handle:
... for record in modify_records(FastaIterator(handle)):
... print(record.id)
...
ALPHA
BETA
GAMMA
ALPHA
DELTA
"""
if alphabet is not None:
raise ValueError("The alphabet argument is no longer supported")
super().__init__(source, mode="t", fmt="Fasta")

def parse(self, handle):
"""Start parsing the file, and return a SeqRecord generator."""
records = self.iterate(handle)
return records

def iterate(self, handle):
"""Parse the file and generate SeqRecord objects."""
for line in handle:
if line.startswith(">"):
break
else:
return
title = line[1:].rstrip()
lines = []
for line in handle:
# Main logic
# Note, remove trailing whitespace, and any internal spaces
# (and any embedded \r which are possible in mangled files
# when not opened in universal read lines mode)
if line[0] == ";":
pass
elif line[0] == ">":
try:
first_word = title.split(None, 1)[0]
except IndexError:
first_word = ""
sequence = "".join(lines).replace(" ", "").replace("\r", "")
yield SeqRecord(
Seq(sequence), id=first_word, name=first_word, description=title
)
lines = []
title = line[1:].rstrip()
else:
lines.append(line.rstrip())
try:
first_word = title.split(None, 1)[0]
except IndexError:
first_word = ""
sequence = "".join(lines).replace(" ", "").replace("\r", "")
yield SeqRecord(
Seq(sequence), id=first_word, name=first_word, description=title
)


class FastaWriter(SequenceWriter):
"""Class to write Fasta format files (OBSOLETE).
Expand Down
2 changes: 2 additions & 0 deletions Bio/SeqIO/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,6 +420,8 @@
"ace": AceIO.AceIterator,
"fasta": FastaIO.FastaIterator,
"fasta-2line": FastaIO.FastaTwoLineIterator,
"fasta-blast": FastaIO.FastaBlastIterator,
"fasta-pearson": FastaIO.FastaPearsonIterator,
"ig": IgIO.IgIterator,
"embl": InsdcIO.EmblIterator,
"embl-cds": InsdcIO.EmblCdsFeatureIterator,
Expand Down
16 changes: 16 additions & 0 deletions DEPRECATED.rst
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,22 @@ was deprecated as of Release 1.70.
Biopython modules, methods, functions
=====================================

Bio.SeqIO.FastaIO
-----------------
The ``SimpleFastaParser`` (which is used by Bio.SeqIO.parse if
``format='fasta'`` or ``format='fasta-2line'``) interprets lines before the
first line starting with '>' as comments and skips them. To be consistent with
the most common interpretation of the FASTA file format, the use of such
comment lines at the beginning of the FASTA file was deprecated in Biopython
Release 1.85.
As an alternative, you can use ``format='fasta-pearson'`` to specify the FASTA
file format as defined by William Pearson's FASTA aligner program, allowing for
comment lines at the top of the FASTA file (lines anywhere in the file starting
by ';' are also regarded as comment lines and skipped).
Another option is to use ``format='fasta-blast'``; this follows the FASTA file
format accepted by BLAST, treating any lines starting with '#', ';', or '!' as
comment lines and ignoring them.

Bio.Entrez
----------
The ``egquery`` function wrapping the NCBI EGQuery (Entrez Global Query)
Expand Down
10 changes: 5 additions & 5 deletions Tests/Fasta/README
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,11 @@ tools. These are for tested in Bio.SeqIO and Bio.AlignIO
(where the format is called "fasta") as well as other older
parts of Biopython such as the Bio.Fasta module.

ID Description
f001 1 protein sequence
f002 3 DNA sequences
f003 2 proteins, with comments
fa01 fasta alignment
ID Description
f001 1 protein sequence
f002 3 DNA sequences
f003.fa 2 proteins, with comments
fa01 fasta alignment

The following are example "machine readable" pairwise alignment
output files from the FASTA tools when using the -m 10 command
Expand Down
7 changes: 7 additions & 0 deletions Tests/Fasta/aster_blast.pro
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Some header
>gi|3298468|dbj|BAA31520.1| SAMIPF
#Some comment here
GGHVNPAVTFGAFVGGNITLLRGIVYIIAQLLGSTVACLLLKFVTNDMAVGVFSLSAGVGVTNALVFEIV
;Another comment here
MTFGLVYTVYATAIDPKKGSLGTIAPIAIGFIVGANI
!One more comment
7 changes: 7 additions & 0 deletions Tests/Fasta/aster_pearson.pro
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
Some header
>gi|3298468|dbj|BAA31520.1| SAMIPF
;Some comment here
GGHVNPAVTFGAFVGGNITLLRGIVYIIAQLLGSTVACLLLKFVTNDMAVGVFSLSAGVGVTNALVFEIV
;Another comment here
MTFGLVYTVYATAIDPKKGSLGTIAPIAIGFIVGANI
;One more comment
10 changes: 0 additions & 10 deletions Tests/Fasta/f003

This file was deleted.

5 changes: 5 additions & 0 deletions Tests/Fasta/f003.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
>gi|3318709|pdb|1A91|
MENLNMDLLYMAAAVMMGLAAIGAAIGIGILGGKFLEGAARQPDLIPLLRTQFFIVMGLVDAIPMIAVGL
GLYVMFAVA
>gi|whatever|whatever
MENLNMDLLYMAAAVMMGLAAIGAAIGIGILGG
Loading

0 comments on commit 378c4c1

Please sign in to comment.