Skip to content

Commit

Permalink
Output all mappings unless preferred layer is specified
Browse files Browse the repository at this point in the history
Co-authored-by: Ben Capodanno <[email protected]>
  • Loading branch information
sallybg and bencap committed Jun 14, 2024
1 parent 04acef7 commit c078256
Show file tree
Hide file tree
Showing 5 changed files with 57 additions and 31 deletions.
45 changes: 35 additions & 10 deletions src/dcd_mapping/annotate.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Annotate MaveDB score set metadata with mapped scores."""

import datetime
import json
import logging
Expand Down Expand Up @@ -464,6 +465,7 @@ def save_mapped_output_json(
align_result: AlignmentResult,
tx_output: TxSelectResult | None,
include_vrs_2: bool = False,
preferred_layer_only: bool = False,
output_path: Path | None = None,
) -> Path:
"""Save mapping output for a score set in a JSON file
Expand All @@ -477,24 +479,47 @@ def save_mapped_output_json(
<dcd_mapping_data_dir>/urn:mavedb:00000XXX-X-X_mapping_<ISO8601 datetime>.json
:return: output location
"""
preferred_layer = _set_scoreset_layer(urn, mappings)
metadata = get_raw_scoreset_metadata(urn)
computed_reference_sequence = _get_computed_reference_sequence(
urn, preferred_layer, tx_output
)
mapped_reference_sequence = _get_mapped_reference_sequence(
preferred_layer, tx_output, align_result
)
if preferred_layer_only:
preferred_layers = {
_set_scoreset_layer(urn, mappings),
}
else:
preferred_layers = {mapping.annotation_layer for mapping in mappings}

reference_sequences = {
layer: {"computed_reference_sequence": None, "mapped_reference_sequence": None}
for layer in AnnotationLayer
}

for layer in preferred_layers:
reference_sequences[layer][
"computed_reference_sequence"
] = _get_computed_reference_sequence(urn, layer, tx_output)
reference_sequences[layer][
"mapped_reference_sequence"
] = _get_mapped_reference_sequence(layer, tx_output, align_result)

mapped_scores: list[ScoreAnnotation] = []
for m in mappings:
if m.annotation_layer == preferred_layer:
if m.annotation_layer in preferred_layers:
# drop annotation layer from mapping object
mapped_scores.append(ScoreAnnotation(**m.model_dump()))

output = ScoresetMapping(
metadata=metadata,
computed_reference_sequence=computed_reference_sequence,
mapped_reference_sequence=mapped_reference_sequence,
computed_protein_reference_sequence=reference_sequences[
AnnotationLayer.PROTEIN
]["computed_reference_sequence"],
mapped_protein_reference_sequence=reference_sequences[AnnotationLayer.PROTEIN][
"mapped_reference_sequence"
],
computed_genomic_reference_sequence=reference_sequences[
AnnotationLayer.GENOMIC
]["computed_reference_sequence"],
mapped_genomic_reference_sequence=reference_sequences[AnnotationLayer.GENOMIC][
"mapped_reference_sequence"
],
mapped_scores=mapped_scores,
)

Expand Down
12 changes: 11 additions & 1 deletion src/dcd_mapping/cli.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Provide command-line interface for accessing mapping functions."""

import asyncio
import logging
from pathlib import Path
Expand Down Expand Up @@ -38,11 +39,18 @@
default=False,
help="Include VRS 2.0 mappings",
)
@click.option(
"--prefer_genomic",
is_flag=True,
default=False,
help="If mapped variants are available relative to a genomic sequence, only output the genomic mappings",
)
def cli(
urn: str,
debug: bool,
output: Path | None,
include_vrs_2: bool,
prefer_genomic: bool,
) -> None:
"""Get VRS mapping on preferred transcript for URN.
Expand All @@ -63,7 +71,9 @@ def cli(
)
_logger.debug("debug logging enabled")
try:
asyncio.run(map_scoreset_urn(urn, output, include_vrs_2, silent=False))
asyncio.run(
map_scoreset_urn(urn, output, include_vrs_2, prefer_genomic, silent=False)
)
except (
LookupError,
AlignmentError,
Expand Down
8 changes: 7 additions & 1 deletion src/dcd_mapping/main.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Provide core MaveDB mapping methods."""

import logging
import os
import subprocess
Expand Down Expand Up @@ -123,6 +124,7 @@ async def map_scoreset(
records: list[ScoreRow],
output_path: Path | None = None,
include_vrs_2: bool = False,
prefer_genomic: bool = False,
silent: bool = True,
) -> None:
"""Given information about a MAVE experiment, map to VRS and save output as JSON.
Expand Down Expand Up @@ -182,6 +184,7 @@ async def map_scoreset(
alignment_result,
transcript,
include_vrs_2,
prefer_genomic,
output_path,
)
_emit_info(f"Annotated scores saved to: {final_output}.", silent)
Expand All @@ -191,6 +194,7 @@ async def map_scoreset_urn(
urn: str,
output_path: Path | None = None,
include_vrs_2: bool = False,
prefer_genomic: bool = False,
silent: bool = True,
) -> None:
"""Perform end-to-end mapping for a scoreset.
Expand All @@ -208,4 +212,6 @@ async def map_scoreset_urn(
_logger.critical(msg)
click.echo(f"Error: {msg}")
raise e
await map_scoreset(metadata, records, output_path, include_vrs_2, silent)
await map_scoreset(
metadata, records, output_path, include_vrs_2, prefer_genomic, silent
)
6 changes: 4 additions & 2 deletions src/dcd_mapping/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,8 @@ class ScoresetMapping(BaseModel):
"""Provide all mapped scores for a scoreset."""

metadata: Any # TODO get exact MaveDB metadata structure?
computed_reference_sequence: ComputedReferenceSequence
mapped_reference_sequence: MappedReferenceSequence
computed_protein_reference_sequence: ComputedReferenceSequence | None
mapped_protein_reference_sequence: MappedReferenceSequence | None
computed_genomic_reference_sequence: ComputedReferenceSequence | None
mapped_genomic_reference_sequence: MappedReferenceSequence | None
mapped_scores: list[ScoreAnnotation]
17 changes: 0 additions & 17 deletions src/dcd_mapping/vrs_map.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""Map transcripts to VRS objects."""

import logging
import re
from collections.abc import Iterable
from itertools import cycle

Expand All @@ -18,7 +17,6 @@
SequenceString,
)
from ga4gh.vrs.normalize import normalize
from mavehgvs import patterns
from mavehgvs.util import parse_variant_strings
from mavehgvs.variant import Variant

Expand Down Expand Up @@ -105,11 +103,6 @@ def _create_pre_mapped_hgvs_strings(
msg = f"Variant could not be parsed by mavehgvs: {error}"
raise ValueError(msg)

# TODO protein fs hgvs strings are not supported because they can't be handled by vrs allele translator
if re.search(patterns.protein.pro_fs, str(variant)):
msg = f"Pre-map VRS translation not supported for fs variants denoted with protein hgvs strings. Offending variant was: {variant}"
raise NotImplementedError(msg)

# Ideally we would create an HGVS string namespaced to GA4GH. The line below
# creates such a string, but it is not able to be parsed by the GA4GH VRS translator.
# hgvs_strings.append('ga4gh:' + sequence_id + ':' + str(variant))
Expand Down Expand Up @@ -164,11 +157,6 @@ def _create_post_mapped_hgvs_strings(
msg = f"Variant could not be parsed by mavehgvs: {error}"
raise ValueError(msg)

# TODO protein fs hgvs strings are not supported because they can't be handled by vrs allele translator
if re.search(patterns.protein.pro_fs, str(variant)):
msg = f"Post-map VRS translation not supported for fs variants denoted with protein hgvs strings. Offending variant was: {variant}"
raise NotImplementedError(msg)

if layer is AnnotationLayer.PROTEIN:
assert tx # noqa: S101. mypy help

Expand Down Expand Up @@ -586,12 +574,7 @@ def _construct_vrs_allele(
) -> Allele | Haplotype:
alleles: list[Allele] = []
for hgvs_string in hgvs_strings:
# Generate VRS Allele structure. Set VA digests and SL digests to None
allele = translate_hgvs_to_vrs(hgvs_string)
# allele.id = None
# allele.digest = None
# allele.location.id = None
# allele.location.digest = None

if pre_map:
if sequence_id is None:
Expand Down

0 comments on commit c078256

Please sign in to comment.