From aa120b000c906bd78126a4761c9568224cb170b9 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Thu, 13 Jun 2024 15:07:51 -0700 Subject: [PATCH] Output all mappings unless preferred layer is specified Co-authored-by: Ben Capodanno <31941502+bencap@users.noreply.github.com> --- src/dcd_mapping/annotate.py | 31 +++++++++++++++++++++---------- src/dcd_mapping/cli.py | 9 ++++++++- src/dcd_mapping/main.py | 5 ++++- src/dcd_mapping/schemas.py | 6 ++++-- src/dcd_mapping/vrs_map.py | 15 --------------- 5 files changed, 37 insertions(+), 29 deletions(-) diff --git a/src/dcd_mapping/annotate.py b/src/dcd_mapping/annotate.py index 0614537..20611d9 100644 --- a/src/dcd_mapping/annotate.py +++ b/src/dcd_mapping/annotate.py @@ -464,6 +464,7 @@ def save_mapped_output_json( align_result: AlignmentResult, tx_output: TxSelectResult | None, include_vrs_2: bool = False, + preferred_layer_only: bool = False, output_path: Path | None = None, ) -> Path: """Save mapping output for a score set in a JSON file @@ -477,24 +478,34 @@ def save_mapped_output_json( /urn:mavedb:00000XXX-X-X_mapping_.json :return: output location """ - preferred_layer = _set_scoreset_layer(urn, mappings) metadata = get_raw_scoreset_metadata(urn) - computed_reference_sequence = _get_computed_reference_sequence( - urn, preferred_layer, tx_output - ) - mapped_reference_sequence = _get_mapped_reference_sequence( - preferred_layer, tx_output, align_result - ) + if preferred_layer_only: + preferred_layers = {_set_scoreset_layer(urn, mappings),} + else: + preferred_layers = {mapping.annotation_layer for mapping in mappings} + + reference_sequences = {layer: {"computed_reference_sequence": None, "mapped_reference_sequence": None} for layer in AnnotationLayer} + + for layer in preferred_layers: + reference_sequences[layer]["computed_reference_sequence"] = _get_computed_reference_sequence( + urn, layer, tx_output + ) + reference_sequences[layer]["mapped_reference_sequence"] = _get_mapped_reference_sequence( + layer, tx_output, align_result + ) + mapped_scores: list[ScoreAnnotation] = [] for m in mappings: - if m.annotation_layer == preferred_layer: + if m.annotation_layer in preferred_layers: # drop annotation layer from mapping object mapped_scores.append(ScoreAnnotation(**m.model_dump())) output = ScoresetMapping( metadata=metadata, - computed_reference_sequence=computed_reference_sequence, - mapped_reference_sequence=mapped_reference_sequence, + computed_protein_reference_sequence=reference_sequences[AnnotationLayer.PROTEIN]["computed_reference_sequence"], + mapped_protein_reference_sequence=reference_sequences[AnnotationLayer.PROTEIN]["mapped_reference_sequence"], + computed_genomic_reference_sequence=reference_sequences[AnnotationLayer.GENOMIC]["computed_reference_sequence"], + mapped_genomic_reference_sequence=reference_sequences[AnnotationLayer.GENOMIC]["mapped_reference_sequence"], mapped_scores=mapped_scores, ) diff --git a/src/dcd_mapping/cli.py b/src/dcd_mapping/cli.py index 2d8c9eb..e678024 100644 --- a/src/dcd_mapping/cli.py +++ b/src/dcd_mapping/cli.py @@ -38,11 +38,18 @@ default=False, help="Include VRS 2.0 mappings", ) +@click.option( + "--prefer_genomic", + is_flag=True, + default=False, + help="If mapped variants are available relative to a genomic sequence, only output the genomic mappings", +) def cli( urn: str, debug: bool, output: Path | None, include_vrs_2: bool, + prefer_genomic: bool, ) -> None: """Get VRS mapping on preferred transcript for URN. @@ -63,7 +70,7 @@ def cli( ) _logger.debug("debug logging enabled") try: - asyncio.run(map_scoreset_urn(urn, output, include_vrs_2, silent=False)) + asyncio.run(map_scoreset_urn(urn, output, include_vrs_2, prefer_genomic, silent=False)) except ( LookupError, AlignmentError, diff --git a/src/dcd_mapping/main.py b/src/dcd_mapping/main.py index afed6ee..3803360 100644 --- a/src/dcd_mapping/main.py +++ b/src/dcd_mapping/main.py @@ -123,6 +123,7 @@ async def map_scoreset( records: list[ScoreRow], output_path: Path | None = None, include_vrs_2: bool = False, + prefer_genomic: bool = False, silent: bool = True, ) -> None: """Given information about a MAVE experiment, map to VRS and save output as JSON. @@ -182,6 +183,7 @@ async def map_scoreset( alignment_result, transcript, include_vrs_2, + prefer_genomic, output_path, ) _emit_info(f"Annotated scores saved to: {final_output}.", silent) @@ -191,6 +193,7 @@ async def map_scoreset_urn( urn: str, output_path: Path | None = None, include_vrs_2: bool = False, + prefer_genomic: bool = False, silent: bool = True, ) -> None: """Perform end-to-end mapping for a scoreset. @@ -208,4 +211,4 @@ async def map_scoreset_urn( _logger.critical(msg) click.echo(f"Error: {msg}") raise e - await map_scoreset(metadata, records, output_path, include_vrs_2, silent) + await map_scoreset(metadata, records, output_path, include_vrs_2, prefer_genomic, silent) diff --git a/src/dcd_mapping/schemas.py b/src/dcd_mapping/schemas.py index 1425515..e77d385 100644 --- a/src/dcd_mapping/schemas.py +++ b/src/dcd_mapping/schemas.py @@ -180,6 +180,8 @@ class ScoresetMapping(BaseModel): """Provide all mapped scores for a scoreset.""" metadata: Any # TODO get exact MaveDB metadata structure? - computed_reference_sequence: ComputedReferenceSequence - mapped_reference_sequence: MappedReferenceSequence + computed_protein_reference_sequence: ComputedReferenceSequence | None + mapped_protein_reference_sequence: MappedReferenceSequence | None + computed_genomic_reference_sequence: ComputedReferenceSequence | None + mapped_genomic_reference_sequence: MappedReferenceSequence | None mapped_scores: list[ScoreAnnotation] diff --git a/src/dcd_mapping/vrs_map.py b/src/dcd_mapping/vrs_map.py index c230db5..692b70d 100644 --- a/src/dcd_mapping/vrs_map.py +++ b/src/dcd_mapping/vrs_map.py @@ -105,11 +105,6 @@ def _create_pre_mapped_hgvs_strings( msg = f"Variant could not be parsed by mavehgvs: {error}" raise ValueError(msg) - # TODO protein fs hgvs strings are not supported because they can't be handled by vrs allele translator - if re.search(patterns.protein.pro_fs, str(variant)): - msg = f"Pre-map VRS translation not supported for fs variants denoted with protein hgvs strings. Offending variant was: {variant}" - raise NotImplementedError(msg) - # Ideally we would create an HGVS string namespaced to GA4GH. The line below # creates such a string, but it is not able to be parsed by the GA4GH VRS translator. # hgvs_strings.append('ga4gh:' + sequence_id + ':' + str(variant)) @@ -164,11 +159,6 @@ def _create_post_mapped_hgvs_strings( msg = f"Variant could not be parsed by mavehgvs: {error}" raise ValueError(msg) - # TODO protein fs hgvs strings are not supported because they can't be handled by vrs allele translator - if re.search(patterns.protein.pro_fs, str(variant)): - msg = f"Post-map VRS translation not supported for fs variants denoted with protein hgvs strings. Offending variant was: {variant}" - raise NotImplementedError(msg) - if layer is AnnotationLayer.PROTEIN: assert tx # noqa: S101. mypy help @@ -586,12 +576,7 @@ def _construct_vrs_allele( ) -> Allele | Haplotype: alleles: list[Allele] = [] for hgvs_string in hgvs_strings: - # Generate VRS Allele structure. Set VA digests and SL digests to None allele = translate_hgvs_to_vrs(hgvs_string) - # allele.id = None - # allele.digest = None - # allele.location.id = None - # allele.location.digest = None if pre_map: if sequence_id is None: