Skip to content

Commit

Permalink
add blast max version
Browse files Browse the repository at this point in the history
refactor version matching, add test for blast max version, update readme
  • Loading branch information
SchwarzMarek committed Jun 27, 2019
1 parent 53fa57a commit 7efe469
Show file tree
Hide file tree
Showing 4 changed files with 78 additions and 110 deletions.
26 changes: 14 additions & 12 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,9 @@ For correct function the rboAnalyze needs a copy of RFAM database.

There are 2 ways:
1. Run rboAnalyzer with `--download_rfam` flag.
```shell
rboAnalyzer --download_rfam
```
This will download RFAM covariance models to default directory
(`[INSTALL_LOCATION]/rna_blast_analyze/3rd_party_source/rfam`).

Expand Down Expand Up @@ -230,25 +233,24 @@ rboAnalyzer -in MS1_BLAST_output -q MS1_query.fasta -db genomes.fasta.bdb -html
```

## Solving issues:
1. One or more records not found.

Reason: the blastdbcmd was not able to find sequence(s) with respective id(s) in provided database. This is due to inconsistency between the sequence accessions and the BLAST database.

The inconsistency may rise from
1. sequence is not in the database
Solution: Provide correct blast database (update current or create new with `genomes_from_blast`).
2. capturing regexp does not capture the accession number
Solution: Provide capturing regular expression (python 3 syntax) for capturing the sequence id from the fasta header (it must match the id to the BLAST database used)
3. the BLAST database was created without the `-parse_seqids` flag
Solution: Create new database from the sequences used to create new one, this time with `-parse_seqids` flag.
- __One or more records not found__
Reason: the blastdbcmd was not able to find sequence(s) with respective id(s) in provided database.
This is due to inconsistency between the sequence accessions and the BLAST database.
The inconsistency may arise from:
1. __sequence is not in the database__
Solution: Provide correct blast database (update current or create new with `genomes_from_blast`).
2. __capturing regexp does not capture the accession number__
Solution: Provide a capturing regular expression (Python 3 syntax) that extracts the sequence id from the fasta header (the captured id must match the id used in the BLAST database).
3. __the BLAST database was created without the `-parse_seqids` flag__
Solution: Create a new database from the sequences used to build the current one, this time with the `-parse_seqids` flag.

Another option is to call pipeline with `--skip_missing` flag.
This will skip the missing sequences.

Note that no HSP for the missing sequence will be included in pipeline output
and some prediction methods may be influenced by the missing sequence.

2. The `genomes_from_blast` failed
- __The `genomes_from_blast` failed__
The `genomes_from_blast` script has built-in handling of failed downloads,
but by default it retries only 10 times. If you are on an unstable connection,
you might get better results by setting `--retry` to a larger number.
Expand Down
136 changes: 38 additions & 98 deletions rna_blast_analyze/BR_core/BA_verify.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,15 @@
import os
import re
import sys
import operator
from subprocess import check_output, STDOUT, CalledProcessError

from rna_blast_analyze.BR_core import cmalign
from rna_blast_analyze.BR_core.config import CONFIG
from rna_blast_analyze.BR_core.tools_versions import blast_minimal_version, locarna_minimal_version, \
infernal_minimal_version, vrna_minimal_version, clustalo_minimal_version, muscle_minimal_version, \
centroid_homfold_minimal_version, turbofold_minimal_version,\
mfold_minimal_version, method_required_tools
mfold_minimal_version, method_required_tools, blast_maximal_version

ml = logging.getLogger(__name__)

Expand Down Expand Up @@ -40,10 +41,10 @@ def verify_query_blast(blast, query):
return


def verify_blastdbcmd(minimal_version):
def verify_blastdbcmd(minimal_version, maximal_version):
"""verify if blastdbcmd is present in supported version
"""
msgversion = 'blastcmd not installed in required version, required version is {}.{}.{}'.format(*minimal_version)
msgversion = 'blastcmd not installed in required version, required version is between {}.{}.{} and {}.{}.{}'.format(*minimal_version + maximal_version)
msgpath = '{}blastcmd could not be located (not in PATH)'.format(CONFIG.blast_path)
msgsuccess = 'blastcmd is installed in required version'
try:
Expand All @@ -58,15 +59,12 @@ def verify_blastdbcmd(minimal_version):
if r:
ver = r.group().split('.')
ver = [int(i) for i in ver]
for v, minv in zip(ver, minimal_version):
if v > minv:
ml.info(msgsuccess)
return True
elif v < minv:
ml.warning(msgversion)
return False
ml.info(msgsuccess)
return True
bb_min = version_check(ver, minimal_version, msgsuccess, msgversion)
bb_max = version_check(ver, maximal_version, msgsuccess, msgversion, op=operator.le)
if bb_min and bb_max:
return True
else:
return False
else:
ml.warning(msgversion)
return False
Expand All @@ -88,17 +86,8 @@ def verify_locarna(minimal_version):
)
a = a.decode()
if a.startswith('LocARNA'):
r = re.finditer('[0-9]+', a)
for match, minv in zip(r, minimal_version):
v = int(match.group())
if v > minv:
ml.info(msgsuccess)
return True
elif v < minv:
ml.warning(msgversion)
return False
ml.info(msgsuccess)
return True
r = [int(m.group()) for m in re.finditer('[0-9]+', a)]
return version_check(r, minimal_version, msgsuccess, msgversion)
else:
ml.warning(msgversion)
return False
Expand All @@ -123,15 +112,7 @@ def verify_infernal(program, minimal_version):
r = re.search('(?<=# INFERNAL )[0-9.]+', a)
ver = r.group().split('.')
ver = [int(i) for i in ver]
for v, minv in zip(ver, minimal_version):
if v > minv:
ml.info(msgsuccess)
return True
elif v < minv:
ml.warning(msgversion)
return False
ml.info(msgsuccess)
return True
return version_check(ver, minimal_version, msgsuccess, msgversion)
else:
ml.warning(msgversion)
return False
Expand All @@ -153,16 +134,8 @@ def verify_viennarna_program(program, minimal_version):
)
a = a.decode()
if a.startswith(program):
ver = a.split()[1].split('.')
for v, minv in zip(ver, minimal_version):
if int(v) > minv:
ml.info(msgsuccess)
return True
elif int(v) < minv:
ml.warning(msgversion)
return False
ml.info(msgsuccess)
return True
ver = [int(i) for i in a.split()[1].split('.')]
return version_check(ver, minimal_version, msgsuccess, msgversion)
else:
ml.warning(msgversion)
return False
Expand Down Expand Up @@ -217,17 +190,8 @@ def verify_clustalo(minimal_version):
)
a = a.decode()
if a:
r = re.finditer('[0-9]+', a)
for match, minv in zip(r, minimal_version):
v = int(match.group())
if v > minv:
ml.info(msgsuccess)
return True
elif v < minv:
ml.warning(msgversion)
return False
ml.info(msgsuccess)
return True
r = [int(m.group()) for m in re.finditer('[0-9]+', a)]
return version_check(r, minimal_version, msgsuccess, msgversion)
else:
ml.warning(msgversion)
return False
Expand All @@ -249,17 +213,8 @@ def verify_muscle(minimal_version):
)
a = a.decode()
if a.startswith('MUSCLE'):
r = re.finditer('[0-9]+', a)
for match, minv in zip(r, minimal_version):
v = int(match.group())
if v > minv:
ml.info(msgsuccess)
return True
elif v < minv:
ml.warning(msgversion)
return False
ml.info(msgsuccess)
return True
r = [int(m.group()) for m in re.finditer('[0-9]+', a)]
return version_check(r, minimal_version, msgsuccess, msgversion)
else:
ml.warning(msgversion)
return False
Expand Down Expand Up @@ -288,17 +243,8 @@ def verify_centroid_homfold(minimal_version):
a = a.decode()
b = a.split()
if b[0] == 'CentroidHomfold':
r = re.finditer('[0-9]+', b[1])
for match, minv in zip(r, minimal_version):
v = int(match.group())
if v > minv:
ml.info(msgsuccess)
return True
elif v < minv:
ml.warning(msgversion)
return False
ml.info(msgsuccess)
return True
r = [int(m.group()) for m in re.finditer('[0-9]+', b[1])]
return version_check(r, minimal_version, msgsuccess, msgversion)
else:
ml.warning(msgversion)
return False
Expand Down Expand Up @@ -326,17 +272,8 @@ def verify_turbofold(minimal_version):
a = a.decode()
b = a.split()
if b[0] == 'TurboFold:':
r = re.finditer('[0-9]+', b[2])
for match, minv in zip(r, minimal_version):
v = int(match.group())
if v > minv:
ml.info(msgsuccess)
return True
elif v < minv:
ml.warning(msgversion)
return False
ml.info(msgsuccess)
return True
r = [int(m.group()) for m in re.finditer('[0-9]+', b[2])]
return version_check(r, minimal_version, msgsuccess, msgversion)
else:
ml.warning(msgversion)
return False
Expand Down Expand Up @@ -374,17 +311,8 @@ def verify_mfold(minimal_version):
a = a.decode()
b = a.split()
if b[0] == 'hybrid-ss-min':
r = re.finditer('[0-9]+', b[2])
for match, minv in zip(r, minimal_version):
v = int(match.group())
if v > minv:
ml.info(msgsuccess)
return True
elif v < minv:
ml.warning(msgversion)
return False
ml.info(msgsuccess)
return True
r = [int(m.group()) for m in re.finditer('[0-9]+', b[2])]
return version_check(r, minimal_version, msgsuccess, msgversion)
else:
ml.warning(msgversion)
return False
Expand All @@ -393,9 +321,21 @@ def verify_mfold(minimal_version):
return False


def version_check(r, minimal_version, msgsuccess, msgversion, op=operator.ge):
    """Check a detected version against a reference version.

    The two versions are compared component by component (major first) over
    the components they share (``zip`` stops at the shorter sequence). Equal
    components are skipped; the first component that differs decides the
    result by applying ``op`` to it. If all compared components are equal,
    the check passes.

    Skipping equal components matters: ``operator.ge`` and ``operator.le``
    are both true for equal values, so applying ``op`` directly to the first
    component would accept e.g. 2.5.0 against a required minimum of 2.6.0.

    :param r: detected version as a sequence of ints, e.g. ``[2, 7, 1]``
    :param minimal_version: reference version as a sequence of ints; this is
        the lower bound with the default ``op`` and the upper bound when
        ``op=operator.le``
    :param msgsuccess: message logged with ``ml.info`` when the check passes
    :param msgversion: message logged with ``ml.warning`` when the check fails
    :param op: comparison applied as ``op(detected, reference)``;
        ``operator.ge`` (default) for a minimum, ``operator.le`` for a maximum
    :return: ``True`` if the detected version satisfies the bound, else ``False``
    """
    for v, minv in zip(r, minimal_version):
        if v == minv:
            # Tie on this component, so the next one has to decide.
            continue
        if op(v, minv):
            ml.info(msgsuccess)
            return True
        ml.warning(msgversion)
        return False
    # Every compared component was equal: the bound itself is acceptable.
    ml.info(msgsuccess)
    return True


def check_3rd_party_tools():
installed = set()
if verify_blastdbcmd(blast_minimal_version):
if verify_blastdbcmd(blast_minimal_version, blast_maximal_version):
installed.add('blastdbcmd')

if verify_locarna(locarna_minimal_version):
Expand Down
1 change: 1 addition & 0 deletions rna_blast_analyze/BR_core/tools_versions.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
blast_minimal_version = [2, 6, 0]
blast_maximal_version = [2, 8, 0]
locarna_minimal_version = [1, 9, 2]
infernal_minimal_version = [1, 1, 2]
vrna_minimal_version = [2, 3, 5]
Expand Down
25 changes: 25 additions & 0 deletions test_func/test_verify_version.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import unittest
import operator
from rna_blast_analyze.BR_core.BA_verify import version_check


class TestBlastParser(unittest.TestCase):
    """Accepting cases for ``version_check`` with lower and upper bounds."""

    def test_less(self):
        # The reference (2.0.0) is lower than the detected version (2.1.0);
        # with the default operator the reference acts as a minimum.
        detected, reference = [2, 1, 0], [2, 0, 0]
        outcome = version_check(detected, reference, '', '')
        self.assertTrue(outcome)

    def test_greater(self):
        # The reference (2.1.0) is greater than the detected version (2.0.0);
        # with operator.le the reference acts as a maximum.
        detected, reference = [2, 0, 0], [2, 1, 0]
        outcome = version_check(detected, reference, '', '', op=operator.le)
        self.assertTrue(outcome)

    def test_equal(self):
        # A version equal to the minimum is accepted.
        same = [1, 1, 1]
        outcome = version_check(list(same), list(same), '', '')
        self.assertTrue(outcome)

    def test_equal2(self):
        # A version equal to the maximum is accepted.
        same = [1, 1, 1]
        outcome = version_check(list(same), list(same), '', '', op=operator.le)
        self.assertTrue(outcome)

0 comments on commit 7efe469

Please sign in to comment.