From c7c6b4e8b0d7e2f1654806e733de5326757b442f Mon Sep 17 00:00:00 2001 From: mirand863 Date: Mon, 5 Jun 2023 08:45:42 +0200 Subject: [PATCH] MAINT: update `orient-seqs` to use vsearch `orient` (#149) --- rescript/orient.py | 123 +++++++++++++++++----------------- rescript/plugin_setup.py | 61 ++++++++++++++--- rescript/tests/test_orient.py | 58 +++++++--------- 3 files changed, 140 insertions(+), 102 deletions(-) diff --git a/rescript/orient.py b/rescript/orient.py index 95ced99..287b5a3 100644 --- a/rescript/orient.py +++ b/rescript/orient.py @@ -6,78 +6,79 @@ # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- -import tempfile from q2_types.feature_data import DNAFASTAFormat, DNAIterator -from warnings import warn +from typing import List, Dict, Any from ._utilities import run_command -def _warn_deprecated(value, default, name): - if value != default: - warn( - f"{name} is deprecated and will be removed from RESCRIPt 2023.5 " - "(RESCRIPt 2023.2 will be the last release with this parameter).", - FutureWarning - ) +def _add_optional_parameters(cmd: List[str], **kwargs: Dict[str, Any]) -> None: + """ + Add optional parameters to a command. + + Parameters + ---------- + cmd : List[str] + The command to add optional parameters to. + kwargs : Dict[str, Any] + The optional parameters to add to the command. + """ + for parameter, value in kwargs.items(): + if isinstance(value, bool) and value is True: + cmd.append(f"--{parameter}") + elif isinstance(value, str) and value != "": + cmd.append(f"--{parameter}") + cmd.append(f"{value}") -def orient_seqs(sequences: DNAFASTAFormat, - reference_sequences: DNAFASTAFormat = None, - perc_identity: float = 0.9, - query_cov: float = 0.9, - threads: int = 1, - left_justify: bool = False, - ) -> (DNAFASTAFormat, DNAFASTAFormat): - matched_temp, notmatched = DNAFASTAFormat(), DNAFASTAFormat() +def orient_seqs( + sequences: DNAFASTAFormat, + reference_sequences: DNAFASTAFormat = None, + threads: int = 1, + dbmask: str = None, + relabel: str = None, + relabel_keep: bool = None, + relabel_md5: bool = None, + relabel_self: bool = None, + relabel_sha1: bool = None, + sizein: bool = None, + sizeout: bool = None, +) -> (DNAFASTAFormat, DNAFASTAFormat): + oriented, notmatched = DNAFASTAFormat(), DNAFASTAFormat() if reference_sequences is not None: - # use vsearch to search query seqs against reference database - # report orientation of query seqs relative to reference seqs. - with tempfile.NamedTemporaryFile() as out: - # note: qmask is disabled as DNAFASTAFormat requires all output - # seqs to be uppercase. Could loop through output seqs to convert - # to upper but which is faster: disabling masking or looping - # through with skbio? - cmd = ['vsearch', - '--usearch_global', str(sequences), - '--matched', str(matched_temp), - '--notmatched', str(notmatched), - '--db', str(reference_sequences), - '--id', str(perc_identity), - '--maxaccepts', '1', - '--strand', 'both', - '--qmask', 'none', - '--query_cov', str(query_cov), - '--threads', str(threads), - '--userfields', 'qstrand', - '--userout', out.name] - if left_justify: - cmd.append('--leftjust') - run_command(cmd) - with open(out.name, 'r') as orient: - orientations = [line.strip() for line in orient] + # use vsearch to orient seqs against reference database + # note: qmask is disabled as DNAFASTAFormat requires all output + # seqs to be uppercase. Could loop through output seqs to convert + # to upper but which is faster: disabling masking or looping + # through with skbio? + cmd = [ + 'vsearch', + '--orient', str(sequences), + '--fastaout', str(oriented), + '--notmatched', str(notmatched), + '--db', str(reference_sequences), + '--qmask', 'none', + '--threads', str(threads), + ] + + _add_optional_parameters( + cmd, + dbmask=dbmask, + relabel=relabel, + relabel_keep=relabel_keep, + relabel_md5=relabel_md5, + relabel_self=relabel_self, + relabel_sha1=relabel_sha1, + sizein=sizein, + sizeout=sizeout, + ) - # Warn about parameters that will be deprecated - _warn_deprecated(perc_identity, 0.9, 'perc_identity') - _warn_deprecated(query_cov, 0.9, 'query_cov') - _warn_deprecated(left_justify, False, 'left_justify') + run_command(cmd) - # if any query seqs are in reverse orientation, reverse complement - if '-' in orientations: - matched = DNAFASTAFormat() - with matched.open() as out_fasta: - for seq, orientation in zip( - matched_temp.view(DNAIterator), orientations): - if orientation == '+': - seq.write(out_fasta) - elif orientation == '-': - seq.reverse_complement().write(out_fasta) - else: - matched = matched_temp else: - matched = DNAFASTAFormat() - with matched.open() as out_fasta: + oriented = DNAFASTAFormat() + with oriented.open() as out_fasta: for seq in sequences.view(DNAIterator): seq.reverse_complement().write(out_fasta) - return matched, notmatched + return oriented, notmatched diff --git a/rescript/plugin_setup.py b/rescript/plugin_setup.py index 1d43d4e..7346c9d 100644 --- a/rescript/plugin_setup.py +++ b/rescript/plugin_setup.py @@ -278,8 +278,14 @@ 'threads': Int % Range(1, 256), 'perc_identity': Float % Range(0, 1, inclusive_start=False, inclusive_end=True), - 'left_justify': Bool, - 'query_cov': Float % Range(0.0, 1.0, inclusive_end=True), + 'dbmask': Str % Choices(['none', 'dust', 'soft']), + 'relabel': Str, + 'relabel_keep': Bool, + 'relabel_md5': Bool, + 'relabel_self': Bool, + 'relabel_sha1': Bool, + 'sizein': Bool, + 'sizeout': Bool, } VSEARCH_PARAM_DESCRIPTIONS = { @@ -289,9 +295,25 @@ 'perc_identity': 'The percent identity at which clustering should be ' 'performed. This parameter maps to vsearch\'s --id ' 'parameter.', - 'left_justify': 'Reject match if the pairwise alignment begins with gaps', - 'query_cov': 'Reject match if query alignment coverage per high-scoring ' - 'pair is lower.', + 'dbmask': 'Mask regions in the target database sequences using the ' + 'dust method, or do not mask (none). When using soft ' + 'masking, search commands become case sensitive.', + 'relabel': 'Relabel sequences using the prefix string and a ticker ' + '(1, 2, 3, etc.) to construct the new headers. Use --sizeout' + ' to conserve the abundance annotations.', + 'relabel_keep': 'When relabeling, keep the original identifier in the ' + 'header after a space.', + 'relabel_md5': 'When relabeling, use the MD5 digest of the sequence as ' + 'the new identifier. Use --sizeout to conserve the ' + 'abundance annotations.', + 'relabel_self': 'Relabel sequences using the sequence itself as a label.', + 'relabel_sha1': 'When relabeling, use the SHA1 digest of the sequence as ' + 'the new identifier. The probability of a collision is ' + 'smaller than the MD5 algorithm.', + 'sizein': 'In de novo mode, abundance annotations (pattern ' + '`[>;]size=integer[;]`) present in sequence headers ' + 'are taken into account.', + 'sizeout': 'Add abundance annotations to the output FASTA files.', } @@ -557,7 +579,17 @@ function=orient_seqs, inputs={'sequences': FeatureData[Sequence], 'reference_sequences': FeatureData[Sequence]}, - parameters=VSEARCH_PARAMS, + parameters={ + 'threads': VSEARCH_PARAMS['threads'], + 'dbmask': VSEARCH_PARAMS['dbmask'], + 'relabel': VSEARCH_PARAMS['relabel'], + 'relabel_keep': VSEARCH_PARAMS['relabel_keep'], + 'relabel_md5': VSEARCH_PARAMS['relabel_md5'], + 'relabel_self': VSEARCH_PARAMS['relabel_self'], + 'relabel_sha1': VSEARCH_PARAMS['relabel_sha1'], + 'sizein': VSEARCH_PARAMS['sizein'], + 'sizeout': VSEARCH_PARAMS['sizeout'], + }, outputs=[('oriented_seqs', FeatureData[Sequence]), ('unmatched_seqs', FeatureData[Sequence])], input_descriptions={ @@ -567,7 +599,17 @@ 'will be reverse complemented and all ' 'parameters will be ignored.' )}, - parameter_descriptions=VSEARCH_PARAM_DESCRIPTIONS, + parameter_descriptions={ + 'threads': VSEARCH_PARAM_DESCRIPTIONS['threads'], + 'dbmask': VSEARCH_PARAM_DESCRIPTIONS['dbmask'], + 'relabel': VSEARCH_PARAM_DESCRIPTIONS['relabel'], + 'relabel_keep': VSEARCH_PARAM_DESCRIPTIONS['relabel_keep'], + 'relabel_md5': VSEARCH_PARAM_DESCRIPTIONS['relabel_md5'], + 'relabel_self': VSEARCH_PARAM_DESCRIPTIONS['relabel_self'], + 'relabel_sha1': VSEARCH_PARAM_DESCRIPTIONS['relabel_sha1'], + 'sizein': VSEARCH_PARAM_DESCRIPTIONS['sizein'], + 'sizeout': VSEARCH_PARAM_DESCRIPTIONS['sizeout'], + }, output_descriptions={ 'oriented_seqs': 'Query sequences in same orientation as top matching ' 'reference sequence.', @@ -582,8 +624,9 @@ 'sequences in either orientation. Alternatively, if no reference ' 'sequences are provided as input, all input sequences will be ' 'reverse-complemented. In this case, no alignment is performed, ' - 'and all alignment parameters (`perc_identity`, `query_cov`, ' - '`threads`, and `left_justify`) are ignored.' + 'and all alignment parameters (`dbmask`, `relabel`, ' + '`relabel_keep`, `relabel_md5`, `relabel_self`, `relabel_sha1`, ' + '`sizein`, `sizeout` and `threads`) are ignored.' ), citations=[citations['rognes2016vsearch']] ) diff --git a/rescript/tests/test_orient.py b/rescript/tests/test_orient.py index 25a6add..7812636 100644 --- a/rescript/tests/test_orient.py +++ b/rescript/tests/test_orient.py @@ -11,6 +11,7 @@ from qiime2.plugin.testing import TestPluginBase from q2_types.feature_data import DNAIterator from qiime2.plugins import rescript +from rescript.orient import _add_optional_parameters import_data = qiime2.Artifact.import_data @@ -66,38 +67,6 @@ def test_reorient_default(self): exp_unmatched_ids = {'JUNK', 'MOREJUNK'} self.assertEqual(unmatched_ids, exp_unmatched_ids) - def test_reorient_exact(self): - # just check correct IDs are returned; default test checks orientations - reoriented, unmatched, = rescript.actions.orient_seqs( - sequences=self.seqs, reference_sequences=self.ref, - perc_identity=1.0) - unmatched_ids = { - seq.metadata['id'] for seq in unmatched.view(DNAIterator)} - # seq A1 A2 and A3 were modified in self.seqs to have sundry mismatches - exp_unmatched_ids = {'JUNK', 'MOREJUNK', 'A1', 'A2', 'A3'} - self.assertEqual(unmatched_ids, exp_unmatched_ids) - reoriented_ids = { - seq.metadata['id'] for seq in reoriented.view(DNAIterator)} - exp_reoriented_ids = {seq.metadata['id'] for seq in - self.ref.view(DNAIterator)} - exp_unmatched_ids - self.assertEqual(reoriented_ids, exp_reoriented_ids) - - def test_reorient_left_justify(self): - # just check correct IDs are returned; default test checks orientations - reoriented, unmatched, = rescript.actions.orient_seqs( - sequences=self.seqs, reference_sequences=self.ref, - left_justify=True) - unmatched_ids = { - seq.metadata['id'] for seq in unmatched.view(DNAIterator)} - # seq A2 was modified in self.seqs to have gaps at the 5' end - exp_unmatched_ids = {'JUNK', 'MOREJUNK', 'A2'} - self.assertEqual(unmatched_ids, exp_unmatched_ids) - reoriented_ids = { - seq.metadata['id'] for seq in reoriented.view(DNAIterator)} - exp_reoriented_ids = {seq.metadata['id'] for seq in - self.ref.view(DNAIterator)} - exp_unmatched_ids - self.assertEqual(reoriented_ids, exp_reoriented_ids) - def test_reorient_no_ref(self): reoriented, unmatched = rescript.actions.orient_seqs( sequences=self.seqs, reference_sequences=None, @@ -110,3 +79,28 @@ def test_reorient_no_ref(self): for exp, test in zip(*(exp_seqs, test_seqs)): self.assertEqual(str(exp), str(test)) self.assertEqual(exp.metadata['id'], test.metadata['id']) + + def test_add_optional_parameters(self): + expected = [ + "--dbmask", "dust", + "--relabel", "new_id", + "--relabel_keep", + "--relabel_md5", + "--relabel_self", + "--relabel_sha1", + "--sizein", + "--sizeout", + ] + result = [] + _add_optional_parameters( + result, + dbmask="dust", + relabel="new_id", + relabel_keep=True, + relabel_md5=True, + relabel_self=True, + relabel_sha1=True, + sizein=True, + sizeout=True, + ) + self.assertEqual(result, expected)