Skip to content

Commit

Permalink
Output all mappings unless preferred layer is specified
Browse files Browse the repository at this point in the history
Co-authored-by: Ben Capodanno <[email protected]>
  • Loading branch information
sallybg and bencap committed Jun 13, 2024
1 parent 6327830 commit aa120b0
Show file tree
Hide file tree
Showing 5 changed files with 37 additions and 29 deletions.
31 changes: 21 additions & 10 deletions src/dcd_mapping/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -464,6 +464,7 @@ def save_mapped_output_json(
align_result: AlignmentResult,
tx_output: TxSelectResult | None,
include_vrs_2: bool = False,
preferred_layer_only: bool = False,
output_path: Path | None = None,
) -> Path:
"""Save mapping output for a score set in a JSON file
Expand All @@ -477,24 +478,34 @@ def save_mapped_output_json(
<dcd_mapping_data_dir>/urn:mavedb:00000XXX-X-X_mapping_<ISO8601 datetime>.json
:return: output location
"""
preferred_layer = _set_scoreset_layer(urn, mappings)
metadata = get_raw_scoreset_metadata(urn)
computed_reference_sequence = _get_computed_reference_sequence(
urn, preferred_layer, tx_output
)
mapped_reference_sequence = _get_mapped_reference_sequence(
preferred_layer, tx_output, align_result
)
if preferred_layer_only:
preferred_layers = {_set_scoreset_layer(urn, mappings),}
else:
preferred_layers = {mapping.annotation_layer for mapping in mappings}

reference_sequences = {layer: {"computed_reference_sequence": None, "mapped_reference_sequence": None} for layer in AnnotationLayer}

for layer in preferred_layers:
reference_sequences[layer]["computed_reference_sequence"] = _get_computed_reference_sequence(
urn, layer, tx_output
)
reference_sequences[layer]["mapped_reference_sequence"] = _get_mapped_reference_sequence(
layer, tx_output, align_result
)

mapped_scores: list[ScoreAnnotation] = []
for m in mappings:
if m.annotation_layer == preferred_layer:
if m.annotation_layer in preferred_layers:
# drop annotation layer from mapping object
mapped_scores.append(ScoreAnnotation(**m.model_dump()))

output = ScoresetMapping(
metadata=metadata,
computed_reference_sequence=computed_reference_sequence,
mapped_reference_sequence=mapped_reference_sequence,
computed_protein_reference_sequence=reference_sequences[AnnotationLayer.PROTEIN]["computed_reference_sequence"],
mapped_protein_reference_sequence=reference_sequences[AnnotationLayer.PROTEIN]["mapped_reference_sequence"],
computed_genomic_reference_sequence=reference_sequences[AnnotationLayer.GENOMIC]["computed_reference_sequence"],
mapped_genomic_reference_sequence=reference_sequences[AnnotationLayer.GENOMIC]["mapped_reference_sequence"],
mapped_scores=mapped_scores,
)

Expand Down
9 changes: 8 additions & 1 deletion src/dcd_mapping/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,18 @@
default=False,
help="Include VRS 2.0 mappings",
)
@click.option(
"--prefer_genomic",
is_flag=True,
default=False,
help="If mapped variants are available relative to a genomic sequence, only output the genomic mappings",
)
def cli(
urn: str,
debug: bool,
output: Path | None,
include_vrs_2: bool,
prefer_genomic: bool,
) -> None:
"""Get VRS mapping on preferred transcript for URN.
Expand All @@ -63,7 +70,7 @@ def cli(
)
_logger.debug("debug logging enabled")
try:
asyncio.run(map_scoreset_urn(urn, output, include_vrs_2, silent=False))
asyncio.run(map_scoreset_urn(urn, output, include_vrs_2, prefer_genomic, silent=False))
except (
LookupError,
AlignmentError,
Expand Down
5 changes: 4 additions & 1 deletion src/dcd_mapping/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ async def map_scoreset(
records: list[ScoreRow],
output_path: Path | None = None,
include_vrs_2: bool = False,
prefer_genomic: bool = False,
silent: bool = True,
) -> None:
"""Given information about a MAVE experiment, map to VRS and save output as JSON.
Expand Down Expand Up @@ -182,6 +183,7 @@ async def map_scoreset(
alignment_result,
transcript,
include_vrs_2,
prefer_genomic,
output_path,
)
_emit_info(f"Annotated scores saved to: {final_output}.", silent)
Expand All @@ -191,6 +193,7 @@ async def map_scoreset_urn(
urn: str,
output_path: Path | None = None,
include_vrs_2: bool = False,
prefer_genomic: bool = False,
silent: bool = True,
) -> None:
"""Perform end-to-end mapping for a scoreset.
Expand All @@ -208,4 +211,4 @@ async def map_scoreset_urn(
_logger.critical(msg)
click.echo(f"Error: {msg}")
raise e
await map_scoreset(metadata, records, output_path, include_vrs_2, silent)
await map_scoreset(metadata, records, output_path, include_vrs_2, prefer_genomic, silent)
6 changes: 4 additions & 2 deletions src/dcd_mapping/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,8 @@ class ScoresetMapping(BaseModel):
"""Provide all mapped scores for a scoreset."""

metadata: Any # TODO get exact MaveDB metadata structure?
computed_reference_sequence: ComputedReferenceSequence
mapped_reference_sequence: MappedReferenceSequence
computed_protein_reference_sequence: ComputedReferenceSequence | None
mapped_protein_reference_sequence: MappedReferenceSequence | None
computed_genomic_reference_sequence: ComputedReferenceSequence | None
mapped_genomic_reference_sequence: MappedReferenceSequence | None
mapped_scores: list[ScoreAnnotation]
15 changes: 0 additions & 15 deletions src/dcd_mapping/vrs_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,11 +105,6 @@ def _create_pre_mapped_hgvs_strings(
msg = f"Variant could not be parsed by mavehgvs: {error}"
raise ValueError(msg)

# TODO protein fs hgvs strings are not supported because they can't be handled by vrs allele translator
if re.search(patterns.protein.pro_fs, str(variant)):
msg = f"Pre-map VRS translation not supported for fs variants denoted with protein hgvs strings. Offending variant was: {variant}"
raise NotImplementedError(msg)

# Ideally we would create an HGVS string namespaced to GA4GH. The line below
# creates such a string, but it is not able to be parsed by the GA4GH VRS translator.
# hgvs_strings.append('ga4gh:' + sequence_id + ':' + str(variant))
Expand Down Expand Up @@ -164,11 +159,6 @@ def _create_post_mapped_hgvs_strings(
msg = f"Variant could not be parsed by mavehgvs: {error}"
raise ValueError(msg)

# TODO protein fs hgvs strings are not supported because they can't be handled by vrs allele translator
if re.search(patterns.protein.pro_fs, str(variant)):
msg = f"Post-map VRS translation not supported for fs variants denoted with protein hgvs strings. Offending variant was: {variant}"
raise NotImplementedError(msg)

if layer is AnnotationLayer.PROTEIN:
assert tx # noqa: S101. mypy help

Expand Down Expand Up @@ -586,12 +576,7 @@ def _construct_vrs_allele(
) -> Allele | Haplotype:
alleles: list[Allele] = []
for hgvs_string in hgvs_strings:
# Generate VRS Allele structure. Set VA digests and SL digests to None
allele = translate_hgvs_to_vrs(hgvs_string)
# allele.id = None
# allele.digest = None
# allele.location.id = None
# allele.location.digest = None

if pre_map:
if sequence_id is None:
Expand Down

0 comments on commit aa120b0

Please sign in to comment.