Output all mappings unless preferred layer is specified

Co-authored-by: Ben Capodanno <[email protected]>
VariantEffect · Jun 14, 2024 · c078256 · c078256
1 parent 04acef7
commit c078256
Show file tree

Hide file tree

Showing 5 changed files with 57 additions and 31 deletions.
diff --git a/src/dcd_mapping/annotate.py b/src/dcd_mapping/annotate.py
@@ -1,4 +1,5 @@
 """Annotate MaveDB score set metadata with mapped scores."""
+
 import datetime
 import json
 import logging
@@ -464,6 +465,7 @@ def save_mapped_output_json(
     align_result: AlignmentResult,
     tx_output: TxSelectResult | None,
     include_vrs_2: bool = False,
+    preferred_layer_only: bool = False,
     output_path: Path | None = None,
 ) -> Path:
     """Save mapping output for a score set in a JSON file
@@ -477,24 +479,47 @@ def save_mapped_output_json(
         <dcd_mapping_data_dir>/urn:mavedb:00000XXX-X-X_mapping_<ISO8601 datetime>.json
     :return: output location
     """
-    preferred_layer = _set_scoreset_layer(urn, mappings)
     metadata = get_raw_scoreset_metadata(urn)
-    computed_reference_sequence = _get_computed_reference_sequence(
-        urn, preferred_layer, tx_output
-    )
-    mapped_reference_sequence = _get_mapped_reference_sequence(
-        preferred_layer, tx_output, align_result
-    )
+    if preferred_layer_only:
+        preferred_layers = {
+            _set_scoreset_layer(urn, mappings),
+        }
+    else:
+        preferred_layers = {mapping.annotation_layer for mapping in mappings}
+
+    reference_sequences = {
+        layer: {"computed_reference_sequence": None, "mapped_reference_sequence": None}
+        for layer in AnnotationLayer
+    }
+
+    for layer in preferred_layers:
+        reference_sequences[layer][
+            "computed_reference_sequence"
+        ] = _get_computed_reference_sequence(urn, layer, tx_output)
+        reference_sequences[layer][
+            "mapped_reference_sequence"
+        ] = _get_mapped_reference_sequence(layer, tx_output, align_result)
+
     mapped_scores: list[ScoreAnnotation] = []
     for m in mappings:
-        if m.annotation_layer == preferred_layer:
+        if m.annotation_layer in preferred_layers:
             # drop annotation layer from mapping object
             mapped_scores.append(ScoreAnnotation(**m.model_dump()))
 
     output = ScoresetMapping(
         metadata=metadata,
-        computed_reference_sequence=computed_reference_sequence,
-        mapped_reference_sequence=mapped_reference_sequence,
+        computed_protein_reference_sequence=reference_sequences[
+            AnnotationLayer.PROTEIN
+        ]["computed_reference_sequence"],
+        mapped_protein_reference_sequence=reference_sequences[AnnotationLayer.PROTEIN][
+            "mapped_reference_sequence"
+        ],
+        computed_genomic_reference_sequence=reference_sequences[
+            AnnotationLayer.GENOMIC
+        ]["computed_reference_sequence"],
+        mapped_genomic_reference_sequence=reference_sequences[AnnotationLayer.GENOMIC][
+            "mapped_reference_sequence"
+        ],
         mapped_scores=mapped_scores,
     )
 

diff --git a/src/dcd_mapping/cli.py b/src/dcd_mapping/cli.py
@@ -1,4 +1,5 @@
 """Provide command-line interface for accessing mapping functions."""
+
 import asyncio
 import logging
 from pathlib import Path
@@ -38,11 +39,18 @@
     default=False,
     help="Include VRS 2.0 mappings",
 )
+@click.option(
+    "--prefer_genomic",
+    is_flag=True,
+    default=False,
+    help="If mapped variants are available relative to a genomic sequence, only output the genomic mappings",
+)
 def cli(
     urn: str,
     debug: bool,
     output: Path | None,
     include_vrs_2: bool,
+    prefer_genomic: bool,
 ) -> None:
     """Get VRS mapping on preferred transcript for URN.
 
@@ -63,7 +71,9 @@ def cli(
     )
     _logger.debug("debug logging enabled")
     try:
-        asyncio.run(map_scoreset_urn(urn, output, include_vrs_2, silent=False))
+        asyncio.run(
+            map_scoreset_urn(urn, output, include_vrs_2, prefer_genomic, silent=False)
+        )
     except (
         LookupError,
         AlignmentError,

diff --git a/src/dcd_mapping/main.py b/src/dcd_mapping/main.py
@@ -1,4 +1,5 @@
 """Provide core MaveDB mapping methods."""
+
 import logging
 import os
 import subprocess
@@ -123,6 +124,7 @@ async def map_scoreset(
     records: list[ScoreRow],
     output_path: Path | None = None,
     include_vrs_2: bool = False,
+    prefer_genomic: bool = False,
     silent: bool = True,
 ) -> None:
     """Given information about a MAVE experiment, map to VRS and save output as JSON.
@@ -182,6 +184,7 @@ async def map_scoreset(
         alignment_result,
         transcript,
         include_vrs_2,
+        prefer_genomic,
         output_path,
     )
     _emit_info(f"Annotated scores saved to: {final_output}.", silent)
@@ -191,6 +194,7 @@ async def map_scoreset_urn(
     urn: str,
     output_path: Path | None = None,
     include_vrs_2: bool = False,
+    prefer_genomic: bool = False,
     silent: bool = True,
 ) -> None:
     """Perform end-to-end mapping for a scoreset.
@@ -208,4 +212,6 @@ async def map_scoreset_urn(
         _logger.critical(msg)
         click.echo(f"Error: {msg}")
         raise e
-    await map_scoreset(metadata, records, output_path, include_vrs_2, silent)
+    await map_scoreset(
+        metadata, records, output_path, include_vrs_2, prefer_genomic, silent
+    )
diff --git a/src/dcd_mapping/schemas.py b/src/dcd_mapping/schemas.py
@@ -180,6 +180,8 @@ class ScoresetMapping(BaseModel):
     """Provide all mapped scores for a scoreset."""
 
     metadata: Any  # TODO get exact MaveDB metadata structure?
-    computed_reference_sequence: ComputedReferenceSequence
-    mapped_reference_sequence: MappedReferenceSequence
+    computed_protein_reference_sequence: ComputedReferenceSequence | None
+    mapped_protein_reference_sequence: MappedReferenceSequence | None
+    computed_genomic_reference_sequence: ComputedReferenceSequence | None
+    mapped_genomic_reference_sequence: MappedReferenceSequence | None
     mapped_scores: list[ScoreAnnotation]
diff --git a/src/dcd_mapping/vrs_map.py b/src/dcd_mapping/vrs_map.py
@@ -1,7 +1,6 @@
 """Map transcripts to VRS objects."""
 
 import logging
-import re
 from collections.abc import Iterable
 from itertools import cycle
 
@@ -18,7 +17,6 @@
     SequenceString,
 )
 from ga4gh.vrs.normalize import normalize
-from mavehgvs import patterns
 from mavehgvs.util import parse_variant_strings
 from mavehgvs.variant import Variant
 
@@ -105,11 +103,6 @@ def _create_pre_mapped_hgvs_strings(
             msg = f"Variant could not be parsed by mavehgvs: {error}"
             raise ValueError(msg)
 
-        # TODO protein fs hgvs strings are not supported because they can't be handled by vrs allele translator
-        if re.search(patterns.protein.pro_fs, str(variant)):
-            msg = f"Pre-map VRS translation not supported for fs variants denoted with protein hgvs strings. Offending variant was: {variant}"
-            raise NotImplementedError(msg)
-
         # Ideally we would create an HGVS string namespaced to GA4GH. The line below
         # creates such a string, but it is not able to be parsed by the GA4GH VRS translator.
         # hgvs_strings.append('ga4gh:' + sequence_id + ':' + str(variant))
@@ -164,11 +157,6 @@ def _create_post_mapped_hgvs_strings(
             msg = f"Variant could not be parsed by mavehgvs: {error}"
             raise ValueError(msg)
 
-        # TODO protein fs hgvs strings are not supported because they can't be handled by vrs allele translator
-        if re.search(patterns.protein.pro_fs, str(variant)):
-            msg = f"Post-map VRS translation not supported for fs variants denoted with protein hgvs strings. Offending variant was: {variant}"
-            raise NotImplementedError(msg)
-
         if layer is AnnotationLayer.PROTEIN:
             assert tx  # noqa: S101. mypy help
 
@@ -586,12 +574,7 @@ def _construct_vrs_allele(
 ) -> Allele | Haplotype:
     alleles: list[Allele] = []
     for hgvs_string in hgvs_strings:
-        # Generate VRS Allele structure. Set VA digests and SL digests to None
         allele = translate_hgvs_to_vrs(hgvs_string)
-        # allele.id = None
-        # allele.digest = None
-        # allele.location.id = None
-        # allele.location.digest = None
 
         if pre_map:
             if sequence_id is None: