Skip to content

Commit

Permalink
MAINT: update orient-seqs to use vsearch orient (#149)
Browse files Browse the repository at this point in the history
  • Loading branch information
mirand863 authored Jun 5, 2023
1 parent ed0344d commit c7c6b4e
Show file tree
Hide file tree
Showing 3 changed files with 140 additions and 102 deletions.
123 changes: 62 additions & 61 deletions rescript/orient.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,78 +6,79 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import tempfile
from q2_types.feature_data import DNAFASTAFormat, DNAIterator
from warnings import warn
from typing import List, Dict, Any

from ._utilities import run_command


def _warn_deprecated(value, default, name):
if value != default:
warn(
f"{name} is deprecated and will be removed from RESCRIPt 2023.5 "
"(RESCRIPt 2023.2 will be the last release with this parameter).",
FutureWarning
)
def _add_optional_parameters(cmd: List[str], **kwargs: Dict[str, Any]) -> None:
"""
Add optional parameters to a command.
Parameters
----------
cmd : List[str]
The command to add optional parameters to.
kwargs : Dict[str, Any]
The optional parameters to add to the command.
"""
for parameter, value in kwargs.items():
if isinstance(value, bool) and value is True:
cmd.append(f"--{parameter}")
elif isinstance(value, str) and value != "":
cmd.append(f"--{parameter}")
cmd.append(f"{value}")


def orient_seqs(sequences: DNAFASTAFormat,
reference_sequences: DNAFASTAFormat = None,
perc_identity: float = 0.9,
query_cov: float = 0.9,
threads: int = 1,
left_justify: bool = False,
) -> (DNAFASTAFormat, DNAFASTAFormat):
matched_temp, notmatched = DNAFASTAFormat(), DNAFASTAFormat()
def orient_seqs(
sequences: DNAFASTAFormat,
reference_sequences: DNAFASTAFormat = None,
threads: int = 1,
dbmask: str = None,
relabel: str = None,
relabel_keep: bool = None,
relabel_md5: bool = None,
relabel_self: bool = None,
relabel_sha1: bool = None,
sizein: bool = None,
sizeout: bool = None,
) -> (DNAFASTAFormat, DNAFASTAFormat):
oriented, notmatched = DNAFASTAFormat(), DNAFASTAFormat()
if reference_sequences is not None:
# use vsearch to search query seqs against reference database
# report orientation of query seqs relative to reference seqs.
with tempfile.NamedTemporaryFile() as out:
# note: qmask is disabled as DNAFASTAFormat requires all output
# seqs to be uppercase. Could loop through output seqs to convert
# to upper but which is faster: disabling masking or looping
# through with skbio?
cmd = ['vsearch',
'--usearch_global', str(sequences),
'--matched', str(matched_temp),
'--notmatched', str(notmatched),
'--db', str(reference_sequences),
'--id', str(perc_identity),
'--maxaccepts', '1',
'--strand', 'both',
'--qmask', 'none',
'--query_cov', str(query_cov),
'--threads', str(threads),
'--userfields', 'qstrand',
'--userout', out.name]
if left_justify:
cmd.append('--leftjust')
run_command(cmd)
with open(out.name, 'r') as orient:
orientations = [line.strip() for line in orient]
# use vsearch to orient seqs against reference database
# note: qmask is disabled as DNAFASTAFormat requires all output
# seqs to be uppercase. Could loop through output seqs to convert
# to upper but which is faster: disabling masking or looping
# through with skbio?
cmd = [
'vsearch',
'--orient', str(sequences),
'--fastaout', str(oriented),
'--notmatched', str(notmatched),
'--db', str(reference_sequences),
'--qmask', 'none',
'--threads', str(threads),
]

_add_optional_parameters(
cmd,
dbmask=dbmask,
relabel=relabel,
relabel_keep=relabel_keep,
relabel_md5=relabel_md5,
relabel_self=relabel_self,
relabel_sha1=relabel_sha1,
sizein=sizein,
sizeout=sizeout,
)

# Warn about parameters that will be deprecated
_warn_deprecated(perc_identity, 0.9, 'perc_identity')
_warn_deprecated(query_cov, 0.9, 'query_cov')
_warn_deprecated(left_justify, False, 'left_justify')
run_command(cmd)

# if any query seqs are in reverse orientation, reverse complement
if '-' in orientations:
matched = DNAFASTAFormat()
with matched.open() as out_fasta:
for seq, orientation in zip(
matched_temp.view(DNAIterator), orientations):
if orientation == '+':
seq.write(out_fasta)
elif orientation == '-':
seq.reverse_complement().write(out_fasta)
else:
matched = matched_temp
else:
matched = DNAFASTAFormat()
with matched.open() as out_fasta:
oriented = DNAFASTAFormat()
with oriented.open() as out_fasta:
for seq in sequences.view(DNAIterator):
seq.reverse_complement().write(out_fasta)

return matched, notmatched
return oriented, notmatched
61 changes: 52 additions & 9 deletions rescript/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,8 +278,14 @@
'threads': Int % Range(1, 256),
'perc_identity': Float % Range(0, 1, inclusive_start=False,
inclusive_end=True),
'left_justify': Bool,
'query_cov': Float % Range(0.0, 1.0, inclusive_end=True),
'dbmask': Str % Choices(['none', 'dust', 'soft']),
'relabel': Str,
'relabel_keep': Bool,
'relabel_md5': Bool,
'relabel_self': Bool,
'relabel_sha1': Bool,
'sizein': Bool,
'sizeout': Bool,
}

VSEARCH_PARAM_DESCRIPTIONS = {
Expand All @@ -289,9 +295,25 @@
'perc_identity': 'The percent identity at which clustering should be '
'performed. This parameter maps to vsearch\'s --id '
'parameter.',
'left_justify': 'Reject match if the pairwise alignment begins with gaps',
'query_cov': 'Reject match if query alignment coverage per high-scoring '
'pair is lower.',
'dbmask': 'Mask regions in the target database sequences using the '
'dust method, or do not mask (none). When using soft '
'masking, search commands become case sensitive.',
'relabel': 'Relabel sequences using the prefix string and a ticker '
'(1, 2, 3, etc.) to construct the new headers. Use --sizeout'
' to conserve the abundance annotations.',
'relabel_keep': 'When relabeling, keep the original identifier in the '
'header after a space.',
'relabel_md5': 'When relabeling, use the MD5 digest of the sequence as '
'the new identifier. Use --sizeout to conserve the '
'abundance annotations.',
'relabel_self': 'Relabel sequences using the sequence itself as a label.',
'relabel_sha1': 'When relabeling, use the SHA1 digest of the sequence as '
'the new identifier. The probability of a collision is '
'smaller than the MD5 algorithm.',
'sizein': 'In de novo mode, abundance annotations (pattern '
'`[>;]size=integer[;]`) present in sequence headers '
'are taken into account.',
'sizeout': 'Add abundance annotations to the output FASTA files.',
}


Expand Down Expand Up @@ -557,7 +579,17 @@
function=orient_seqs,
inputs={'sequences': FeatureData[Sequence],
'reference_sequences': FeatureData[Sequence]},
parameters=VSEARCH_PARAMS,
parameters={
'threads': VSEARCH_PARAMS['threads'],
'dbmask': VSEARCH_PARAMS['dbmask'],
'relabel': VSEARCH_PARAMS['relabel'],
'relabel_keep': VSEARCH_PARAMS['relabel_keep'],
'relabel_md5': VSEARCH_PARAMS['relabel_md5'],
'relabel_self': VSEARCH_PARAMS['relabel_self'],
'relabel_sha1': VSEARCH_PARAMS['relabel_sha1'],
'sizein': VSEARCH_PARAMS['sizein'],
'sizeout': VSEARCH_PARAMS['sizeout'],
},
outputs=[('oriented_seqs', FeatureData[Sequence]),
('unmatched_seqs', FeatureData[Sequence])],
input_descriptions={
Expand All @@ -567,7 +599,17 @@
'will be reverse complemented and all '
'parameters will be ignored.'
)},
parameter_descriptions=VSEARCH_PARAM_DESCRIPTIONS,
parameter_descriptions={
'threads': VSEARCH_PARAM_DESCRIPTIONS['threads'],
'dbmask': VSEARCH_PARAM_DESCRIPTIONS['dbmask'],
'relabel': VSEARCH_PARAM_DESCRIPTIONS['relabel'],
'relabel_keep': VSEARCH_PARAM_DESCRIPTIONS['relabel_keep'],
'relabel_md5': VSEARCH_PARAM_DESCRIPTIONS['relabel_md5'],
'relabel_self': VSEARCH_PARAM_DESCRIPTIONS['relabel_self'],
'relabel_sha1': VSEARCH_PARAM_DESCRIPTIONS['relabel_sha1'],
'sizein': VSEARCH_PARAM_DESCRIPTIONS['sizein'],
'sizeout': VSEARCH_PARAM_DESCRIPTIONS['sizeout'],
},
output_descriptions={
'oriented_seqs': 'Query sequences in same orientation as top matching '
'reference sequence.',
Expand All @@ -582,8 +624,9 @@
'sequences in either orientation. Alternatively, if no reference '
'sequences are provided as input, all input sequences will be '
'reverse-complemented. In this case, no alignment is performed, '
'and all alignment parameters (`perc_identity`, `query_cov`, '
'`threads`, and `left_justify`) are ignored.'
'and all alignment parameters (`dbmask`, `relabel`, '
'`relabel_keep`, `relabel_md5`, `relabel_self`, `relabel_sha1`, '
'`sizein`, `sizeout` and `threads`) are ignored.'
),
citations=[citations['rognes2016vsearch']]
)
Expand Down
58 changes: 26 additions & 32 deletions rescript/tests/test_orient.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from qiime2.plugin.testing import TestPluginBase
from q2_types.feature_data import DNAIterator
from qiime2.plugins import rescript
from rescript.orient import _add_optional_parameters

import_data = qiime2.Artifact.import_data

Expand Down Expand Up @@ -66,38 +67,6 @@ def test_reorient_default(self):
exp_unmatched_ids = {'JUNK', 'MOREJUNK'}
self.assertEqual(unmatched_ids, exp_unmatched_ids)

def test_reorient_exact(self):
# just check correct IDs are returned; default test checks orientations
reoriented, unmatched, = rescript.actions.orient_seqs(
sequences=self.seqs, reference_sequences=self.ref,
perc_identity=1.0)
unmatched_ids = {
seq.metadata['id'] for seq in unmatched.view(DNAIterator)}
# seq A1 A2 and A3 were modified in self.seqs to have sundry mismatches
exp_unmatched_ids = {'JUNK', 'MOREJUNK', 'A1', 'A2', 'A3'}
self.assertEqual(unmatched_ids, exp_unmatched_ids)
reoriented_ids = {
seq.metadata['id'] for seq in reoriented.view(DNAIterator)}
exp_reoriented_ids = {seq.metadata['id'] for seq in
self.ref.view(DNAIterator)} - exp_unmatched_ids
self.assertEqual(reoriented_ids, exp_reoriented_ids)

def test_reorient_left_justify(self):
# just check correct IDs are returned; default test checks orientations
reoriented, unmatched, = rescript.actions.orient_seqs(
sequences=self.seqs, reference_sequences=self.ref,
left_justify=True)
unmatched_ids = {
seq.metadata['id'] for seq in unmatched.view(DNAIterator)}
# seq A2 was modified in self.seqs to have gaps at the 5' end
exp_unmatched_ids = {'JUNK', 'MOREJUNK', 'A2'}
self.assertEqual(unmatched_ids, exp_unmatched_ids)
reoriented_ids = {
seq.metadata['id'] for seq in reoriented.view(DNAIterator)}
exp_reoriented_ids = {seq.metadata['id'] for seq in
self.ref.view(DNAIterator)} - exp_unmatched_ids
self.assertEqual(reoriented_ids, exp_reoriented_ids)

def test_reorient_no_ref(self):
reoriented, unmatched = rescript.actions.orient_seqs(
sequences=self.seqs, reference_sequences=None,
Expand All @@ -110,3 +79,28 @@ def test_reorient_no_ref(self):
for exp, test in zip(*(exp_seqs, test_seqs)):
self.assertEqual(str(exp), str(test))
self.assertEqual(exp.metadata['id'], test.metadata['id'])

def test_add_optional_parameters(self):
expected = [
"--dbmask", "dust",
"--relabel", "new_id",
"--relabel_keep",
"--relabel_md5",
"--relabel_self",
"--relabel_sha1",
"--sizein",
"--sizeout",
]
result = []
_add_optional_parameters(
result,
dbmask="dust",
relabel="new_id",
relabel_keep=True,
relabel_md5=True,
relabel_self=True,
relabel_sha1=True,
sizein=True,
sizeout=True,
)
self.assertEqual(result, expected)

0 comments on commit c7c6b4e

Please sign in to comment.