MAINT: update orient-seqs to use vsearch orient (#149)

bokulich-lab · Jun 5, 2023 · c7c6b4e · c7c6b4e
1 parent ed0344d
commit c7c6b4e
Show file tree

Hide file tree

Showing 3 changed files with 140 additions and 102 deletions.
diff --git a/rescript/orient.py b/rescript/orient.py
@@ -6,78 +6,79 @@
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
 
-import tempfile
 from q2_types.feature_data import DNAFASTAFormat, DNAIterator
-from warnings import warn
+from typing import List, Dict, Any
 
 from ._utilities import run_command
 
 
-def _warn_deprecated(value, default, name):
-    if value != default:
-        warn(
-            f"{name} is deprecated and will be removed from RESCRIPt 2023.5 "
-            "(RESCRIPt 2023.2 will be the last release with this parameter).",
-            FutureWarning
-        )
+def _add_optional_parameters(cmd: List[str], **kwargs: Dict[str, Any]) -> None:
+    """
+    Add optional parameters to a command.
+
+    Parameters
+    ----------
+    cmd : List[str]
+        The command to add optional parameters to.
+    kwargs : Dict[str, Any]
+        The optional parameters to add to the command.
+    """
+    for parameter, value in kwargs.items():
+        if isinstance(value, bool) and value is True:
+            cmd.append(f"--{parameter}")
+        elif isinstance(value, str) and value != "":
+            cmd.append(f"--{parameter}")
+            cmd.append(f"{value}")
 
 
-def orient_seqs(sequences: DNAFASTAFormat,
-                reference_sequences: DNAFASTAFormat = None,
-                perc_identity: float = 0.9,
-                query_cov: float = 0.9,
-                threads: int = 1,
-                left_justify: bool = False,
-                ) -> (DNAFASTAFormat, DNAFASTAFormat):
-    matched_temp, notmatched = DNAFASTAFormat(), DNAFASTAFormat()
+def orient_seqs(
+    sequences: DNAFASTAFormat,
+    reference_sequences: DNAFASTAFormat = None,
+    threads: int = 1,
+    dbmask: str = None,
+    relabel: str = None,
+    relabel_keep: bool = None,
+    relabel_md5: bool = None,
+    relabel_self: bool = None,
+    relabel_sha1: bool = None,
+    sizein: bool = None,
+    sizeout: bool = None,
+) -> (DNAFASTAFormat, DNAFASTAFormat):
+    oriented, notmatched = DNAFASTAFormat(), DNAFASTAFormat()
     if reference_sequences is not None:
-        # use vsearch to search query seqs against reference database
-        # report orientation of query seqs relative to reference seqs.
-        with tempfile.NamedTemporaryFile() as out:
-            # note: qmask is disabled as DNAFASTAFormat requires all output
-            # seqs to be uppercase. Could loop through output seqs to convert
-            # to upper but which is faster: disabling masking or looping
-            # through with skbio?
-            cmd = ['vsearch',
-                   '--usearch_global', str(sequences),
-                   '--matched', str(matched_temp),
-                   '--notmatched', str(notmatched),
-                   '--db', str(reference_sequences),
-                   '--id', str(perc_identity),
-                   '--maxaccepts', '1',
-                   '--strand', 'both',
-                   '--qmask', 'none',
-                   '--query_cov', str(query_cov),
-                   '--threads', str(threads),
-                   '--userfields', 'qstrand',
-                   '--userout', out.name]
-            if left_justify:
-                cmd.append('--leftjust')
-            run_command(cmd)
-            with open(out.name, 'r') as orient:
-                orientations = [line.strip() for line in orient]
+        # use vsearch to orient seqs against reference database
+        # note: qmask is disabled as DNAFASTAFormat requires all output
+        # seqs to be uppercase. Could loop through output seqs to convert
+        # to upper but which is faster: disabling masking or looping
+        # through with skbio?
+        cmd = [
+            'vsearch',
+            '--orient', str(sequences),
+            '--fastaout', str(oriented),
+            '--notmatched', str(notmatched),
+            '--db', str(reference_sequences),
+            '--qmask', 'none',
+            '--threads', str(threads),
+        ]
+
+        _add_optional_parameters(
+            cmd,
+            dbmask=dbmask,
+            relabel=relabel,
+            relabel_keep=relabel_keep,
+            relabel_md5=relabel_md5,
+            relabel_self=relabel_self,
+            relabel_sha1=relabel_sha1,
+            sizein=sizein,
+            sizeout=sizeout,
+        )
 
-            # Warn about parameters that will be deprecated
-            _warn_deprecated(perc_identity, 0.9, 'perc_identity')
-            _warn_deprecated(query_cov, 0.9, 'query_cov')
-            _warn_deprecated(left_justify, False, 'left_justify')
+        run_command(cmd)
 
-        # if any query seqs are in reverse orientation, reverse complement
-        if '-' in orientations:
-            matched = DNAFASTAFormat()
-            with matched.open() as out_fasta:
-                for seq, orientation in zip(
-                        matched_temp.view(DNAIterator), orientations):
-                    if orientation == '+':
-                        seq.write(out_fasta)
-                    elif orientation == '-':
-                        seq.reverse_complement().write(out_fasta)
-        else:
-            matched = matched_temp
     else:
-        matched = DNAFASTAFormat()
-        with matched.open() as out_fasta:
+        oriented = DNAFASTAFormat()
+        with oriented.open() as out_fasta:
             for seq in sequences.view(DNAIterator):
                 seq.reverse_complement().write(out_fasta)
 
-    return matched, notmatched
+    return oriented, notmatched
diff --git a/rescript/plugin_setup.py b/rescript/plugin_setup.py
@@ -278,8 +278,14 @@
     'threads': Int % Range(1, 256),
     'perc_identity': Float % Range(0, 1, inclusive_start=False,
                                    inclusive_end=True),
-    'left_justify': Bool,
-    'query_cov': Float % Range(0.0, 1.0, inclusive_end=True),
+    'dbmask': Str % Choices(['none', 'dust', 'soft']),
+    'relabel': Str,
+    'relabel_keep': Bool,
+    'relabel_md5': Bool,
+    'relabel_self': Bool,
+    'relabel_sha1': Bool,
+    'sizein': Bool,
+    'sizeout': Bool,
 }
 
 VSEARCH_PARAM_DESCRIPTIONS = {
@@ -289,9 +295,25 @@
     'perc_identity': 'The percent identity at which clustering should be '
                      'performed. This parameter maps to vsearch\'s --id '
                      'parameter.',
-    'left_justify': 'Reject match if the pairwise alignment begins with gaps',
-    'query_cov': 'Reject match if query alignment coverage per high-scoring '
-                 'pair is lower.',
+    'dbmask': 'Mask regions in the target database sequences using the '
+              'dust method, or do not mask (none). When using soft '
+              'masking, search commands become case sensitive.',
+    'relabel': 'Relabel sequences using the prefix string and a ticker '
+               '(1, 2, 3, etc.) to construct the new headers. Use --sizeout'
+               ' to conserve the abundance annotations.',
+    'relabel_keep': 'When relabeling, keep the original identifier in the '
+                    'header after a space.',
+    'relabel_md5': 'When relabeling, use the MD5 digest of the sequence as '
+                   'the new identifier. Use --sizeout to conserve the '
+                   'abundance annotations.',
+    'relabel_self': 'Relabel sequences using the sequence itself as a label.',
+    'relabel_sha1': 'When relabeling, use the SHA1 digest of the sequence as '
+                    'the new identifier. The probability of a collision is '
+                    'smaller than the MD5 algorithm.',
+    'sizein': 'In de novo mode, abundance annotations (pattern '
+              '`[>;]size=integer[;]`) present in sequence headers '
+              'are taken into account.',
+    'sizeout': 'Add abundance annotations to the output FASTA files.',
 }
 
 
@@ -557,7 +579,17 @@
     function=orient_seqs,
     inputs={'sequences': FeatureData[Sequence],
             'reference_sequences': FeatureData[Sequence]},
-    parameters=VSEARCH_PARAMS,
+    parameters={
+        'threads': VSEARCH_PARAMS['threads'],
+        'dbmask': VSEARCH_PARAMS['dbmask'],
+        'relabel': VSEARCH_PARAMS['relabel'],
+        'relabel_keep': VSEARCH_PARAMS['relabel_keep'],
+        'relabel_md5': VSEARCH_PARAMS['relabel_md5'],
+        'relabel_self': VSEARCH_PARAMS['relabel_self'],
+        'relabel_sha1': VSEARCH_PARAMS['relabel_sha1'],
+        'sizein': VSEARCH_PARAMS['sizein'],
+        'sizeout': VSEARCH_PARAMS['sizeout'],
+    },
     outputs=[('oriented_seqs', FeatureData[Sequence]),
              ('unmatched_seqs', FeatureData[Sequence])],
     input_descriptions={
@@ -567,7 +599,17 @@
                                 'will be reverse complemented and all '
                                 'parameters will be ignored.'
                                 )},
-    parameter_descriptions=VSEARCH_PARAM_DESCRIPTIONS,
+    parameter_descriptions={
+        'threads': VSEARCH_PARAM_DESCRIPTIONS['threads'],
+        'dbmask': VSEARCH_PARAM_DESCRIPTIONS['dbmask'],
+        'relabel': VSEARCH_PARAM_DESCRIPTIONS['relabel'],
+        'relabel_keep': VSEARCH_PARAM_DESCRIPTIONS['relabel_keep'],
+        'relabel_md5': VSEARCH_PARAM_DESCRIPTIONS['relabel_md5'],
+        'relabel_self': VSEARCH_PARAM_DESCRIPTIONS['relabel_self'],
+        'relabel_sha1': VSEARCH_PARAM_DESCRIPTIONS['relabel_sha1'],
+        'sizein': VSEARCH_PARAM_DESCRIPTIONS['sizein'],
+        'sizeout': VSEARCH_PARAM_DESCRIPTIONS['sizeout'],
+    },
     output_descriptions={
         'oriented_seqs': 'Query sequences in same orientation as top matching '
                          'reference sequence.',
@@ -582,8 +624,9 @@
         'sequences in either orientation. Alternatively, if no reference '
         'sequences are provided as input, all input sequences will be '
         'reverse-complemented. In this case, no alignment is performed, '
-        'and all alignment parameters (`perc_identity`, `query_cov`, '
-        '`threads`, and `left_justify`) are ignored.'
+        'and all alignment parameters (`dbmask`, `relabel`, '
+        '`relabel_keep`, `relabel_md5`, `relabel_self`, `relabel_sha1`, '
+        '`sizein`, `sizeout` and `threads`) are ignored.'
     ),
     citations=[citations['rognes2016vsearch']]
 )

diff --git a/rescript/tests/test_orient.py b/rescript/tests/test_orient.py
@@ -11,6 +11,7 @@
 from qiime2.plugin.testing import TestPluginBase
 from q2_types.feature_data import DNAIterator
 from qiime2.plugins import rescript
+from rescript.orient import _add_optional_parameters
 
 import_data = qiime2.Artifact.import_data
 
@@ -66,38 +67,6 @@ def test_reorient_default(self):
         exp_unmatched_ids = {'JUNK', 'MOREJUNK'}
         self.assertEqual(unmatched_ids, exp_unmatched_ids)
 
-    def test_reorient_exact(self):
-        # just check correct IDs are returned; default test checks orientations
-        reoriented, unmatched, = rescript.actions.orient_seqs(
-            sequences=self.seqs, reference_sequences=self.ref,
-            perc_identity=1.0)
-        unmatched_ids = {
-            seq.metadata['id'] for seq in unmatched.view(DNAIterator)}
-        # seq A1 A2 and A3 were modified in self.seqs to have sundry mismatches
-        exp_unmatched_ids = {'JUNK', 'MOREJUNK', 'A1', 'A2', 'A3'}
-        self.assertEqual(unmatched_ids, exp_unmatched_ids)
-        reoriented_ids = {
-            seq.metadata['id'] for seq in reoriented.view(DNAIterator)}
-        exp_reoriented_ids = {seq.metadata['id'] for seq in
-                              self.ref.view(DNAIterator)} - exp_unmatched_ids
-        self.assertEqual(reoriented_ids, exp_reoriented_ids)
-
-    def test_reorient_left_justify(self):
-        # just check correct IDs are returned; default test checks orientations
-        reoriented, unmatched, = rescript.actions.orient_seqs(
-            sequences=self.seqs, reference_sequences=self.ref,
-            left_justify=True)
-        unmatched_ids = {
-            seq.metadata['id'] for seq in unmatched.view(DNAIterator)}
-        # seq A2 was modified in self.seqs to have gaps at the 5' end
-        exp_unmatched_ids = {'JUNK', 'MOREJUNK', 'A2'}
-        self.assertEqual(unmatched_ids, exp_unmatched_ids)
-        reoriented_ids = {
-            seq.metadata['id'] for seq in reoriented.view(DNAIterator)}
-        exp_reoriented_ids = {seq.metadata['id'] for seq in
-                              self.ref.view(DNAIterator)} - exp_unmatched_ids
-        self.assertEqual(reoriented_ids, exp_reoriented_ids)
-
     def test_reorient_no_ref(self):
         reoriented, unmatched = rescript.actions.orient_seqs(
             sequences=self.seqs, reference_sequences=None,
@@ -110,3 +79,28 @@ def test_reorient_no_ref(self):
         for exp, test in zip(*(exp_seqs, test_seqs)):
             self.assertEqual(str(exp), str(test))
             self.assertEqual(exp.metadata['id'], test.metadata['id'])
+
+    def test_add_optional_parameters(self):
+        expected = [
+            "--dbmask", "dust",
+            "--relabel", "new_id",
+            "--relabel_keep",
+            "--relabel_md5",
+            "--relabel_self",
+            "--relabel_sha1",
+            "--sizein",
+            "--sizeout",
+        ]
+        result = []
+        _add_optional_parameters(
+            result,
+            dbmask="dust",
+            relabel="new_id",
+            relabel_keep=True,
+            relabel_md5=True,
+            relabel_self=True,
+            relabel_sha1=True,
+            sizein=True,
+            sizeout=True,
+        )
+        self.assertEqual(result, expected)