Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix get_proteins_by_id to avoid overwriting protein_id features #79

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions deepbgc/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,6 +502,7 @@ def fix_duplicate_cds(record):

if protein_id != new_protein_id:
logging.warning('Setting new unique_protein_id %s for CDS %s', new_protein_id, protein_id)
del feature.qualifiers['protein_id']
feature.qualifiers['unique_protein_id'] = [new_protein_id]

protein_ids.add(new_protein_id)
Expand Down
251 changes: 251 additions & 0 deletions test/unit/test_pfam.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,251 @@
import pytest
from pathlib import Path
from io import StringIO
import pandas as pd
from copy import deepcopy
from Bio import SearchIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from deepbgc.pipeline.pfam import HmmscanPfamRecordAnnotator
import deepbgc.util as bgc_util
from tempfile import TemporaryDirectory


@pytest.fixture
def mock_find_exe(mocker):
mock = mocker.patch('deepbgc.pipeline.pfam.find_executable')
mock.return_value = 'hmmscan'
mock.start()
yield mock
mock.stop()


@pytest.fixture
def mock_popen(mocker):

class MockPopen:

returncode = 0

def __init__(self, *arg, **kwargs):
pass

@staticmethod
def communicate():
return '', ''

mock = mocker.patch('deepbgc.pipeline.pfam.subprocess.Popen')
mock.return_value = MockPopen
mock.start()
yield mock
mock.stop()


@pytest.fixture
def mock_searchio(mocker):
mock = mocker.patch('deepbgc.pipeline.pfam.SearchIO')
mock.start()
yield mock
mock.stop()


@pytest.fixture
def mock_read_csv(mocker):
mock = mocker.patch('deepbgc.pipeline.pfam.pd.read_csv')
mock.return_value = pd.DataFrame().from_dict(
{'pfam_id': [], 'clan_id': [], 'clan_name': [], 'pfam_name': [], 'description': []},
)
mock.start()
yield mock
mock.stop()


def test_basic_scenario(mock_find_exe, mock_popen, mock_searchio, mock_read_csv):
"""
basic scenario where there are two CDS with different protein_id qualifiers
"""

record = SeqRecord(
Seq("atgc" * 50),
id="contig_1",
name="contig_1",
description="mock record",
annotations={"molecule_type": "DNA"},
features=[
SeqFeature(
FeatureLocation(2, 11, strand=1),
id='FOOBAR_1000',
type="CDS",
qualifiers={
'locus_tag': ['FOOBAR_1000'],
'protein_id': ['AAK73500.1'],
}
),
SeqFeature(
FeatureLocation(15, 36, strand=1),
id='FOOBAR_1005',
type="CDS",
qualifiers={
'locus_tag': ['FOOBAR_1005'],
'protein_id': ['AAK73498.1'],
}
),
]
)

expected_record = deepcopy(record)
expected_record.features += [
SeqFeature(
FeatureLocation(5, 11, strand=1),
type="PFAM_domain",
id='PF06970.10',
qualifiers={
'db_xref': ['PF06970.10'],
'evalue': [1.2e-28],
'locus_tag': ['AAK73500.1'],
'database': ['31.0'],
}
),
SeqFeature(
FeatureLocation(18, 36, strand=1),
type="PFAM_domain",
id='PF01076.18',
qualifiers={
'db_xref': ['PF01076.18'],
'evalue': [6.9e-52],
'locus_tag': ['AAK73498.1'],
'database': ['31.0'],
}
),
]

expected_record.features = sorted(
expected_record.features,
key=lambda feature: (feature.location.start, -feature.location.end, bgc_util.FEATURE_ORDER.get(feature.type, 0))
)

data = \
"""# --- full sequence --- -------------- this domain ------------- hmm coord ali coord env coord
# target name accession tlen query name accession qlen E-value score bias # of c-Evalue i-Evalue score bias from to from to from to acc description of target
#------------------- ---------- ----- -------------------- ---------- ----- --------- ------ ----- --- --- --------- --------- ------ ----- ----- ----- ----- ----- ----- ----- ---- ---------------------
RepA_N PF06970.10 12 AAK73500.1 - 12 8.1e-29 99.3 0.0 1 1 7e-33 1.2e-28 98.8 0.0 2 12 2 12 2 12 0.99 Replication initiator protein A (RepA) N-terminus
Mob_Pre PF01076.18 12 AAK73498.1 - 12 6.9e-52 176.0 1.7 1 2 4.9e-55 6.9e-52 176.0 1.7 2 12 2 12 2 12 0.98 Plasmid recombination enzyme
Mob_Pre PF01076.18 12 AAK73498.1 - 12 6.9e-52 176.0 1.7 2 2 3 4.1e+03 -2.7 0.1 2 12 2 12 2 12 0.74 Plasmid recombination enzyme
"""
mock_searchio.parse.return_value = SearchIO.parse(StringIO(data), 'hmmscan3-domtab')

assert record.format('genbank') != expected_record.format('genbank')

# record fixing happening before running pfam annotation
bgc_util.fix_record_locus(record)
bgc_util.fix_duplicate_cds(record)
bgc_util.fix_dna_alphabet(record)

with TemporaryDirectory() as tmp_dir:
out_file = Path(tmp_dir).joinpath('hmmscan_out.pfam.domtbl.txt')
out_file.touch()
tmp_path_prefix = str(out_file.parent.joinpath(out_file.stem.split('.')[0]))
annotator = HmmscanPfamRecordAnnotator(record, tmp_path_prefix, db_path='/mock/db', clans_path='/mock/clans')

assert isinstance(annotator, HmmscanPfamRecordAnnotator)

annotator.annotate()

assert record.format('genbank') == expected_record.format('genbank')


def test_same_protein_id(mock_find_exe, mock_popen, mock_searchio, mock_read_csv):
"""
scenario where there are two CDS with same protein_id qualifiers
"""
record = SeqRecord(
Seq("atgc" * 50),
id="contig_1",
name="contig_1",
description="mock record",
annotations={"molecule_type": "DNA"},
features=[
SeqFeature(
FeatureLocation(2, 11, strand=1),
id='FOOBAR_1000',
type="CDS",
qualifiers={
'locus_tag': ['FOOBAR_1000'],
'protein_id': ['AAK73500.1'],
}
),
SeqFeature(
FeatureLocation(15, 36, strand=1),
id='FOOBAR_1005',
type="CDS",
qualifiers={
'locus_tag': ['FOOBAR_1005'],
'protein_id': ['AAK73500.1'],
}
),
]
)

expected_record = deepcopy(record)
del expected_record.features[-1].qualifiers['protein_id']
expected_record.features[-1].qualifiers['unique_protein_id'] = 'AAK73500.1_1'
expected_record.features += [
SeqFeature(
FeatureLocation(5, 11, strand=1),
type="PFAM_domain",
id='PF06970.10',
qualifiers={
'db_xref': ['PF06970.10'],
'evalue': [1.2e-28],
'locus_tag': ['AAK73500.1'],
'database': ['31.0'],
}
),
SeqFeature(
FeatureLocation(18, 36, strand=1),
type="PFAM_domain",
id='PF01076.18',
qualifiers={
'db_xref': ['PF01076.18'],
'evalue': [6.9e-52],
'locus_tag': ['AAK73500.1_1'],
'database': ['31.0'],
}
),
]

expected_record.features = sorted(
expected_record.features,
key=lambda feature: (feature.location.start, -feature.location.end, bgc_util.FEATURE_ORDER.get(feature.type, 0))
)

data = \
"""# --- full sequence --- -------------- this domain ------------- hmm coord ali coord env coord
# target name accession tlen query name accession qlen E-value score bias # of c-Evalue i-Evalue score bias from to from to from to acc description of target
#------------------- ---------- ----- -------------------- ---------- ----- --------- ------ ----- --- --- --------- --------- ------ ----- ----- ----- ----- ----- ----- ----- ---- ---------------------
RepA_N PF06970.10 12 AAK73500.1 - 12 8.1e-29 99.3 0.0 1 1 7e-33 1.2e-28 98.8 0.0 2 12 2 12 2 12 0.99 Replication initiator protein A (RepA) N-terminus
Mob_Pre PF01076.18 12 AAK73500.1_1 - 12 6.9e-52 176.0 1.7 1 2 4.9e-55 6.9e-52 176.0 1.7 2 12 2 12 2 12 0.98 Plasmid recombination enzyme
Mob_Pre PF01076.18 12 AAK73500.1_1 - 12 6.9e-52 176.0 1.7 2 2 3 4.1e+03 -2.7 0.1 2 12 2 12 2 12 0.74 Plasmid recombination enzyme
"""
mock_searchio.parse.return_value = SearchIO.parse(StringIO(data), 'hmmscan3-domtab')

assert record.format('genbank') != expected_record.format('genbank')

# record fixing happening before running pfam annotation
bgc_util.fix_record_locus(record)
bgc_util.fix_duplicate_cds(record)
bgc_util.fix_dna_alphabet(record)

with TemporaryDirectory() as tmp_dir:
out_file = Path(tmp_dir).joinpath('hmmscan_out.pfam.domtbl.txt')
out_file.touch()
tmp_path_prefix = str(out_file.parent.joinpath(out_file.stem.split('.')[0]))
annotator = HmmscanPfamRecordAnnotator(record, tmp_path_prefix, db_path='/mock/db', clans_path='/mock/clans')

assert isinstance(annotator, HmmscanPfamRecordAnnotator)

annotator.annotate()

assert record.format('genbank') == expected_record.format('genbank')