From 6d1bbf73b488cb4b3c138739fdbc81079c834b78 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Fri, 14 Jun 2024 16:47:22 -0700 Subject: [PATCH 01/16] Initialize dcd_mapping api server Co-authored-by: Ben Capodanno <31941502+bencap@users.noreply.github.com> --- Dockerfile | 11 ++++++ docker-compose-dev.yml | 15 ++++++++ pyproject.toml | 5 ++- src/api/__init__.py | 0 src/api/routers/__init__.py | 0 src/api/routers/map.py | 77 +++++++++++++++++++++++++++++++++++++ src/api/server_main.py | 15 ++++++++ 7 files changed, 122 insertions(+), 1 deletion(-) create mode 100644 src/api/__init__.py create mode 100644 src/api/routers/__init__.py create mode 100644 src/api/routers/map.py create mode 100644 src/api/server_main.py diff --git a/Dockerfile b/Dockerfile index 22a982b..363ee58 100644 --- a/Dockerfile +++ b/Dockerfile @@ -46,6 +46,17 @@ RUN pip install -e '.[dev,tests]' RUN pip install -U polars-lts-cpu # install gene normalizer with pg dependencies. TODO: can the pg dependencies be specified in pyproject.toml? #RUN pip install 'gene-normalizer[pg]' + +# not working, needs to happen after db volume is mounted +# ENV GENE_NORM_DB_URL=postgres://postgres:postgres@db:5432/gene_normalizer +# RUN echo "y" | gene_norm_update_remote + ENV PYTHONUNBUFFERED 1 ENV PYTHONPATH "${PYTHONPATH}:/usr/src/app/src" + +# Tell Docker that we will listen on port 8000. +EXPOSE 8000 + +# At container startup, run the application using uvicorn. +CMD ["uvicorn", "api.server_main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/docker-compose-dev.yml b/docker-compose-dev.yml index e2edbe4..f4266f8 100644 --- a/docker-compose-dev.yml +++ b/docker-compose-dev.yml @@ -34,6 +34,21 @@ services: volumes: - vrs-mapping-seqrepo-dev:/usr/local/share/seqrepo + api: + build: + context: . 
+ command: bash -c "uvicorn api.server_main:app --host 0.0.0.0 --port 8000 --reload" + depends_on: + - db + - seqrepo + env_file: + - settings/.env.dev + ports: + - "8004:8000" + volumes: + - .:/usr/src/app + - vrs-mapping-seqrepo-dev:/usr/local/share/seqrepo + volumes: vrs-mapping-data-dev: vrs-mapping-seqrepo-dev: diff --git a/pyproject.toml b/pyproject.toml index 10ac2db..73ad21e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,7 +42,10 @@ dependencies = [ "pydantic>=2", "python-dotenv", "setuptools>=68.0", # tmp -- ensure 3.12 compatibility - "mavehgvs==0.6.1" + "mavehgvs==0.6.1", + "fastapi", + "starlette", + "uvicorn" ] dynamic = ["version"] diff --git a/src/api/__init__.py b/src/api/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/api/routers/__init__.py b/src/api/routers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/api/routers/map.py b/src/api/routers/map.py new file mode 100644 index 0000000..27b7e59 --- /dev/null +++ b/src/api/routers/map.py @@ -0,0 +1,77 @@ +from cool_seq_tool.schemas import AnnotationLayer +from fastapi import APIRouter + +from dcd_mapping.align import align +from dcd_mapping.annotate import ( + _get_computed_reference_sequence, + _get_mapped_reference_sequence, + annotate, +) +from dcd_mapping.mavedb_data import ( + get_raw_scoreset_metadata, + get_scoreset_metadata, + get_scoreset_records, +) +from dcd_mapping.schemas import ScoreAnnotation, ScoresetMapping +from dcd_mapping.transcripts import select_transcript +from dcd_mapping.vrs_map import vrs_map + +router = APIRouter(prefix="/api/v1", tags=["mappings"], responses={404: {"description": "Not found"}}) + +@router.post(path="/map/{urn}", status_code=200, response_model=ScoresetMapping) +async def map_scoreset(urn: str) -> ScoresetMapping: + metadata = get_scoreset_metadata(urn) + records = get_scoreset_records(urn, True) + + alignment_result = align(metadata, True) + + transcript = await select_transcript(metadata, records, alignment_result) + + vrs_results = vrs_map(metadata, alignment_result, records, transcript, True) + + # TODO raise server error if vrs_results is None + if vrs_results is None: + return None + + vrs_results = annotate(vrs_results, transcript, metadata) + + raw_metadata = get_raw_scoreset_metadata(urn) + preferred_layers = {mapping.annotation_layer for mapping in vrs_results} + + reference_sequences = { + layer: {"computed_reference_sequence": None, "mapped_reference_sequence": None} + for layer in AnnotationLayer + } + + for layer in preferred_layers: + reference_sequences[layer][ + "computed_reference_sequence" + ] = _get_computed_reference_sequence(urn, layer, transcript) + reference_sequences[layer][ + "mapped_reference_sequence" + ] = _get_mapped_reference_sequence(layer, transcript, alignment_result) + + mapped_scores: list[ScoreAnnotation] = [] + for m in vrs_results: + if m.annotation_layer in preferred_layers: + # drop annotation layer from mapping object + mapped_scores.append(ScoreAnnotation(**m.model_dump())) + + output = ScoresetMapping( + metadata=raw_metadata, + computed_protein_reference_sequence=reference_sequences[ + AnnotationLayer.PROTEIN + ]["computed_reference_sequence"], + mapped_protein_reference_sequence=reference_sequences[AnnotationLayer.PROTEIN][ + "mapped_reference_sequence" + ], + computed_genomic_reference_sequence=reference_sequences[ + AnnotationLayer.GENOMIC + ]["computed_reference_sequence"], + mapped_genomic_reference_sequence=reference_sequences[AnnotationLayer.GENOMIC][ + 
"mapped_reference_sequence" + ], + mapped_scores=mapped_scores, + ) + + return output diff --git a/src/api/server_main.py b/src/api/server_main.py new file mode 100644 index 0000000..b83da73 --- /dev/null +++ b/src/api/server_main.py @@ -0,0 +1,15 @@ +"""FastAPI server file""" +import uvicorn +from fastapi import FastAPI + +from api.routers import map + +app = FastAPI() + +app.include_router(map.router) + + +# If the application is not already being run within a uvicorn server, start uvicorn here. +if __name__ == "__main__": + uvicorn.run(app, host="0.0.0.0", port=8000) + From 18e340cf673527934810bd10241b579deea7f0b5 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Tue, 18 Jun 2024 15:25:45 -0700 Subject: [PATCH 02/16] Have the api create vrs mapped objects with preferred layer only Co-authored-by: Ben Capodanno <31941502+bencap@users.noreply.github.com> --- src/api/routers/map.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/api/routers/map.py b/src/api/routers/map.py index 27b7e59..a226c81 100644 --- a/src/api/routers/map.py +++ b/src/api/routers/map.py @@ -5,6 +5,7 @@ from dcd_mapping.annotate import ( _get_computed_reference_sequence, _get_mapped_reference_sequence, + _set_scoreset_layer, annotate, ) from dcd_mapping.mavedb_data import ( @@ -36,7 +37,11 @@ async def map_scoreset(urn: str) -> ScoresetMapping: vrs_results = annotate(vrs_results, transcript, metadata) raw_metadata = get_raw_scoreset_metadata(urn) - preferred_layers = {mapping.annotation_layer for mapping in vrs_results} + # TODO change vrs map back to always use only the preferred layer + #preferred_layers = {mapping.annotation_layer for mapping in vrs_results} + preferred_layers = { + _set_scoreset_layer(urn, vrs_results), + } reference_sequences = { layer: {"computed_reference_sequence": None, "mapped_reference_sequence": None} From 0b1c7c1bc880b8ba46c9e00c37395f1d0434cf80 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Thu, 20 Jun 2024 15:19:42 -0700 Subject: [PATCH 03/16] squash: formatting changes --- src/api/routers/map.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/api/routers/map.py b/src/api/routers/map.py index a226c81..ca354d4 100644 --- a/src/api/routers/map.py +++ b/src/api/routers/map.py @@ -17,7 +17,10 @@ from dcd_mapping.transcripts import select_transcript from dcd_mapping.vrs_map import vrs_map -router = APIRouter(prefix="/api/v1", tags=["mappings"], responses={404: {"description": "Not found"}}) +router = APIRouter( + prefix="/api/v1", tags=["mappings"], responses={404: {"description": "Not found"}} +) + @router.post(path="/map/{urn}", status_code=200, response_model=ScoresetMapping) async def map_scoreset(urn: str) -> ScoresetMapping: @@ -38,7 +41,7 @@ async def map_scoreset(urn: str) -> ScoresetMapping: raw_metadata = get_raw_scoreset_metadata(urn) # TODO change vrs map back to always use only the preferred layer - #preferred_layers = {mapping.annotation_layer for mapping in vrs_results} + # preferred_layers = {mapping.annotation_layer for mapping in vrs_results} preferred_layers = { _set_scoreset_layer(urn, vrs_results), } From 06caf580430de813df63c5fafeb1cff1212220b0 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Wed, 7 Aug 2024 15:10:52 -0700 Subject: [PATCH 04/16] Allow pre-map genomic variant without corresponding post-map genomic variant Currently, the only case where this will occur is when the variant falls outside of the region of the target sequence that aligns to the genomic reference sequence. 
Post-map protein variants are still required to output the corresponding pre-map protein variant, because we don't expect a protein variant to fall outside the region of the target sequence that aligns to the reference transcript, although this assumption should be evaluated as a separate issue. --- src/dcd_mapping/annotate.py | 82 +++++++++++++++++++------------------ src/dcd_mapping/schemas.py | 4 +- src/dcd_mapping/vrs_map.py | 41 ++++++++++++------- 3 files changed, 72 insertions(+), 55 deletions(-) diff --git a/src/dcd_mapping/annotate.py b/src/dcd_mapping/annotate.py index fc58742..24bb93a 100644 --- a/src/dcd_mapping/annotate.py +++ b/src/dcd_mapping/annotate.py @@ -251,34 +251,35 @@ def _annotate_allele_mapping( post_mapped: Allele = mapped_score.post_mapped # get vrs_ref_allele_seq for pre-mapped variants - pre_mapped.extensions = [_get_vrs_ref_allele_seq(post_mapped, metadata, tx_results)] - - # Determine reference sequence - if mapped_score.annotation_layer == AnnotationLayer.GENOMIC: - sequence_id = f"ga4gh:{mapped_score.post_mapped.location.sequenceReference.refgetAccession}" - accession = get_chromosome_identifier_from_vrs_id(sequence_id) - if accession is None: - raise ValueError - if accession.startswith("refseq:"): - accession = accession[7:] - else: - if tx_results is None: - raise ValueError # impossible by definition - accession = tx_results.np + pre_mapped.extensions = [_get_vrs_ref_allele_seq(pre_mapped, metadata, tx_results)] + + if post_mapped: + # Determine reference sequence + if mapped_score.annotation_layer == AnnotationLayer.GENOMIC: + sequence_id = f"ga4gh:{mapped_score.post_mapped.location.sequenceReference.refgetAccession}" + accession = get_chromosome_identifier_from_vrs_id(sequence_id) + if accession is None: + raise ValueError + if accession.startswith("refseq:"): + accession = accession[7:] + else: + if tx_results is None: + raise ValueError # impossible by definition + accession = tx_results.np - sr = get_seqrepo() - loc = mapped_score.post_mapped.location - sequence_id = f"ga4gh:{loc.sequenceReference.refgetAccession}" - ref = sr.get_sequence(sequence_id, loc.start, loc.end) - post_mapped.extensions = [ - Extension(type="Extension", name="vrs_ref_allele_seq", value=ref) - ] - hgvs_string, syntax = _get_hgvs_string(post_mapped, accession) - post_mapped.expressions = [Expression(syntax=syntax, value=hgvs_string)] + sr = get_seqrepo() + loc = mapped_score.post_mapped.location + sequence_id = f"ga4gh:{loc.sequenceReference.refgetAccession}" + ref = sr.get_sequence(sequence_id, loc.start, loc.end) + post_mapped.extensions = [ + Extension(type="Extension", name="vrs_ref_allele_seq", value=ref) + ] + hgvs_string, syntax = _get_hgvs_string(post_mapped, accession) + post_mapped.expressions = [Expression(syntax=syntax, value=hgvs_string)] if vrs_version == VrsVersion.V_1_3: pre_mapped = _allele_to_vod(pre_mapped) - post_mapped = _allele_to_vod(post_mapped) + post_mapped = _allele_to_vod(post_mapped) if post_mapped else None return ScoreAnnotationWithLayer( pre_mapped=pre_mapped, @@ -318,20 +319,21 @@ def _annotate_haplotype_mapping( raise ValueError # impossible by definition accession = tx_results.np - sr = get_seqrepo() - for allele in post_mapped.members: - loc = allele.location - sequence_id = f"ga4gh:{loc.sequenceReference.refgetAccession}" - ref = sr.get_sequence(sequence_id, loc.start, loc.end) # TODO type issues?? 
- allele.extensions = [ - Extension(type="Extension", name="vrs_ref_allele_seq", value=ref) - ] - hgvs, syntax = _get_hgvs_string(allele, accession) - allele.expressions = [Expression(syntax=syntax, value=hgvs)] + if post_mapped: + sr = get_seqrepo() + for allele in post_mapped.members: + loc = allele.location + sequence_id = f"ga4gh:{loc.sequenceReference.refgetAccession}" + ref = sr.get_sequence(sequence_id, loc.start, loc.end) # TODO type issues?? + allele.extensions = [ + Extension(type="Extension", name="vrs_ref_allele_seq", value=ref) + ] + hgvs, syntax = _get_hgvs_string(allele, accession) + allele.expressions = [Expression(syntax=syntax, value=hgvs)] if vrs_version == VrsVersion.V_1_3: pre_mapped = _haplotype_to_haplotype_1_3(pre_mapped) - post_mapped = _haplotype_to_haplotype_1_3(post_mapped) + post_mapped = _haplotype_to_haplotype_1_3(post_mapped) if post_mapped else None return ScoreAnnotationWithLayer( pre_mapped=pre_mapped, @@ -367,16 +369,18 @@ def annotate( """ score_annotations = [] for mapped_score in mapped_scores: - if isinstance(mapped_score.pre_mapped, Haplotype) and isinstance( - mapped_score.post_mapped, Haplotype + if isinstance(mapped_score.pre_mapped, Haplotype) and ( + isinstance(mapped_score.post_mapped, Haplotype) + or mapped_score.post_mapped is None ): score_annotations.append( _annotate_haplotype_mapping( mapped_score, tx_results, metadata, vrs_version ) ) - elif isinstance(mapped_score.pre_mapped, Allele) and isinstance( - mapped_score.post_mapped, Allele + elif isinstance(mapped_score.pre_mapped, Allele) and ( + isinstance(mapped_score.post_mapped, Allele) + or mapped_score.post_mapped is None ): score_annotations.append( _annotate_allele_mapping( diff --git a/src/dcd_mapping/schemas.py b/src/dcd_mapping/schemas.py index d470cb9..96e0619 100644 --- a/src/dcd_mapping/schemas.py +++ b/src/dcd_mapping/schemas.py @@ -155,7 +155,7 @@ class MappedScore(BaseModel): annotation_layer: AnnotationLayer score: str | None pre_mapped: Allele | Haplotype - post_mapped: Allele | Haplotype + post_mapped: Allele | Haplotype | None class ScoreAnnotation(BaseModel): @@ -165,7 +165,7 @@ class ScoreAnnotation(BaseModel): """ pre_mapped: vrs_v1_schemas.VariationDescriptor | vrs_v1_schemas.Haplotype | Allele | Haplotype - post_mapped: vrs_v1_schemas.VariationDescriptor | vrs_v1_schemas.Haplotype | Allele | Haplotype + post_mapped: vrs_v1_schemas.VariationDescriptor | vrs_v1_schemas.Haplotype | Allele | Haplotype | None vrs_version: VrsVersion mavedb_id: StrictStr relation: Literal["SO:is_homologous_to"] = "SO:is_homologous_to" diff --git a/src/dcd_mapping/vrs_map.py b/src/dcd_mapping/vrs_map.py index bf31775..9e7d728 100644 --- a/src/dcd_mapping/vrs_map.py +++ b/src/dcd_mapping/vrs_map.py @@ -166,9 +166,16 @@ def _create_post_mapped_hgvs_strings( assert alignment # noqa: S101. 
mypy help variant = _adjust_genomic_variant_to_ref(variant, alignment) - hgvs_strings.append( - get_chromosome_identifier(alignment.chrom) + ":" + str(variant) - ) + if variant: + hgvs_strings.append( + get_chromosome_identifier(alignment.chrom) + ":" + str(variant) + ) + else: + _logger.warning( + "Cannot process variant %s because it is not fully contained within the aligned portion of the query sequence.", + variant, + ) + return [] else: msg = ( f"Could not generate HGVS strings for invalid AnnotationLayer: {layer}" @@ -194,7 +201,7 @@ def _adjust_protein_variant_to_ref( def _adjust_genomic_variant_to_ref( variant: Variant, alignment: AlignmentResult, -) -> Variant: +) -> Variant | None: """Adjust a variant relative to a provided alignment. :param variant: A variant object relative to a scoreset's target sequence @@ -226,8 +233,7 @@ def _adjust_genomic_variant_to_ref( break if query_subrange_containing_hit is None or target_subrange_containing_hit is None: - msg = "Variant was not contained, or multi-position variant was not fully contained, within the aligned portion of the query sequence." - raise ValueError(msg) + return None for idx, start in enumerate(starts): if alignment.strand is Strand.POSITIVE: @@ -348,7 +354,7 @@ def _map_protein_coding_pro( False, ) - if pre_mapped_protein and post_mapped_protein: + if pre_mapped_protein: return MappedScore( accession_id=row.accession, score=row.score, @@ -391,14 +397,21 @@ def _map_genomic( sequence_id, True, ) - post_mapped_genomic = _construct_vrs_allele( - post_mapped_hgvs_strings, - AnnotationLayer.GENOMIC, - None, - False, - ) - if pre_mapped_genomic and post_mapped_genomic: + # genomic post-mapped variants are skipped if they don't fall within aligned region of target sequence + # so we can have a pre-mapped genomic variant without a post-mapped genomic variant + if post_mapped_hgvs_strings: + post_mapped_genomic = _construct_vrs_allele( + post_mapped_hgvs_strings, + AnnotationLayer.GENOMIC, + None, + False, + ) + else: + # TODO add error to MappedScore + post_mapped_genomic = None + + if pre_mapped_genomic: return MappedScore( accession_id=row.accession, score=row.score, From 5f05d8cb97b63abbae28eed30318469d7bde7970 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Fri, 16 Aug 2024 10:00:07 -0700 Subject: [PATCH 05/16] Generate MappedScore with error message if mapping fails --- src/dcd_mapping/annotate.py | 35 ++++-- src/dcd_mapping/schemas.py | 22 ++-- src/dcd_mapping/vrs_map.py | 236 ++++++++++++++++++++---------------- 3 files changed, 172 insertions(+), 121 deletions(-) diff --git a/src/dcd_mapping/annotate.py b/src/dcd_mapping/annotate.py index 24bb93a..cd07624 100644 --- a/src/dcd_mapping/annotate.py +++ b/src/dcd_mapping/annotate.py @@ -288,24 +288,27 @@ def _annotate_allele_mapping( mavedb_id=mapped_score.accession_id, score=float(mapped_score.score) if mapped_score.score else None, annotation_layer=mapped_score.annotation_layer, + error_message=mapped_score.error_message + if mapped_score.error_message + else None, # TODO might not need if statement here ) def _annotate_haplotype_mapping( - mapping: MappedScore, + mapped_score: MappedScore, tx_results: TxSelectResult | None, metadata: ScoresetMetadata, vrs_version: VrsVersion = VrsVersion.V_2, ) -> ScoreAnnotationWithLayer: """Perform annotations and, if necessary, create VRS 1.3 equivalents for haplotype mappings.""" - pre_mapped: Haplotype = mapping.pre_mapped # type: ignore - post_mapped: Haplotype = mapping.post_mapped # type: ignore + pre_mapped: Haplotype = 
mapped_score.pre_mapped # type: ignore + post_mapped: Haplotype = mapped_score.post_mapped # type: ignore # get vrs_ref_allele_seq for pre-mapped variants for allele in pre_mapped.members: allele.extensions = [_get_vrs_ref_allele_seq(allele, metadata, tx_results)] # Determine reference sequence - if mapping.annotation_layer == AnnotationLayer.GENOMIC: + if mapped_score.annotation_layer == AnnotationLayer.GENOMIC: sequence_id = ( f"ga4gh:{post_mapped.members[0].location.sequenceReference.refgetAccession}" ) @@ -339,9 +342,12 @@ def _annotate_haplotype_mapping( pre_mapped=pre_mapped, post_mapped=post_mapped, vrs_version=vrs_version, - mavedb_id=mapping.accession_id, - score=float(mapping.score) if mapping.score is not None else None, - annotation_layer=mapping.annotation_layer, + mavedb_id=mapped_score.accession_id, + score=float(mapped_score.score) if mapped_score.score is not None else None, + annotation_layer=mapped_score.annotation_layer, + error_message=mapped_score.error_message + if mapped_score.error_message + else None, # TODO might not need if statement here ) @@ -369,7 +375,15 @@ def annotate( """ score_annotations = [] for mapped_score in mapped_scores: - if isinstance(mapped_score.pre_mapped, Haplotype) and ( + if mapped_score.pre_mapped is None: + score_annotations.append( + ScoreAnnotationWithLayer( + mavedb_id=mapped_score.accession_id, + score=float(mapped_score.score) if mapped_score.score else None, + error_message=mapped_score.error_message, + ) + ) + elif isinstance(mapped_score.pre_mapped, Haplotype) and ( isinstance(mapped_score.post_mapped, Haplotype) or mapped_score.post_mapped is None ): @@ -388,6 +402,7 @@ def annotate( ) ) else: + # TODO how to combine this error message with other potential error messages? ValueError("inconsistent variant structure") return score_annotations @@ -514,7 +529,9 @@ def save_mapped_output_json( mapped_scores: list[ScoreAnnotation] = [] for m in mappings: - if m.annotation_layer in preferred_layers: + if m.pre_mapped is None: + mapped_scores.append(ScoreAnnotation(**m.model_dump())) + elif m.annotation_layer in preferred_layers: # drop annotation layer from mapping object mapped_scores.append(ScoreAnnotation(**m.model_dump())) diff --git a/src/dcd_mapping/schemas.py b/src/dcd_mapping/schemas.py index 96e0619..c9f86fd 100644 --- a/src/dcd_mapping/schemas.py +++ b/src/dcd_mapping/schemas.py @@ -152,10 +152,11 @@ class MappedScore(BaseModel): model_config = ConfigDict(use_enum_values=True) accession_id: StrictStr - annotation_layer: AnnotationLayer score: str | None - pre_mapped: Allele | Haplotype - post_mapped: Allele | Haplotype | None + annotation_layer: AnnotationLayer | None = None + pre_mapped: Allele | Haplotype | None = None + post_mapped: Allele | Haplotype | None = None + error_message: str | None = None class ScoreAnnotation(BaseModel): @@ -164,12 +165,15 @@ class ScoreAnnotation(BaseModel): This model defines what an individual mapping instance looks like in the final JSON. """ - pre_mapped: vrs_v1_schemas.VariationDescriptor | vrs_v1_schemas.Haplotype | Allele | Haplotype - post_mapped: vrs_v1_schemas.VariationDescriptor | vrs_v1_schemas.Haplotype | Allele | Haplotype | None - vrs_version: VrsVersion mavedb_id: StrictStr - relation: Literal["SO:is_homologous_to"] = "SO:is_homologous_to" - score: float | None + relation: Literal[ + "SO:is_homologous_to" + ] = "SO:is_homologous_to" # TODO this should probably be None if pre_mapped is false? 
+ pre_mapped: vrs_v1_schemas.VariationDescriptor | vrs_v1_schemas.Haplotype | Allele | Haplotype | None = None + post_mapped: vrs_v1_schemas.VariationDescriptor | vrs_v1_schemas.Haplotype | Allele | Haplotype | None = None + vrs_version: VrsVersion | None = None + score: float | None = None + error_message: str | None = None class ScoreAnnotationWithLayer(ScoreAnnotation): @@ -179,7 +183,7 @@ class ScoreAnnotationWithLayer(ScoreAnnotation): Used for filtering individual annotations just before saving the final JSON product. """ - annotation_layer: AnnotationLayer + annotation_layer: AnnotationLayer | None = None class ScoresetMapping(BaseModel): diff --git a/src/dcd_mapping/vrs_map.py b/src/dcd_mapping/vrs_map.py index 9e7d728..be68b92 100644 --- a/src/dcd_mapping/vrs_map.py +++ b/src/dcd_mapping/vrs_map.py @@ -166,16 +166,9 @@ def _create_post_mapped_hgvs_strings( assert alignment # noqa: S101. mypy help variant = _adjust_genomic_variant_to_ref(variant, alignment) - if variant: - hgvs_strings.append( - get_chromosome_identifier(alignment.chrom) + ":" + str(variant) - ) - else: - _logger.warning( - "Cannot process variant %s because it is not fully contained within the aligned portion of the query sequence.", - variant, - ) - return [] + hgvs_strings.append( + get_chromosome_identifier(alignment.chrom) + ":" + str(variant) + ) else: msg = ( f"Could not generate HGVS strings for invalid AnnotationLayer: {layer}" @@ -201,7 +194,7 @@ def _adjust_protein_variant_to_ref( def _adjust_genomic_variant_to_ref( variant: Variant, alignment: AlignmentResult, -) -> Variant | None: +) -> Variant: """Adjust a variant relative to a provided alignment. :param variant: A variant object relative to a scoreset's target sequence @@ -233,7 +226,11 @@ def _adjust_genomic_variant_to_ref( break if query_subrange_containing_hit is None or target_subrange_containing_hit is None: - return None + msg = ( + f"Cannot process variant {variant} because it is not fully contained \ + within the aligned portion of the query sequence", + ) + raise ValueError(msg) for idx, start in enumerate(starts): if alignment.strand is Strand.POSITIVE: @@ -298,7 +295,7 @@ def _map_protein_coding_pro( row: ScoreRow, sequence_id: str, transcript: TxSelectResult, -) -> MappedScore | None: +) -> MappedScore: """Construct VRS object mapping for ``hgvs_pro`` variant column entry These arguments are a little lazy and could be pruned down later @@ -308,15 +305,27 @@ def _map_protein_coding_pro( :param transcript: The transcript selection information for a score set :return: VRS mapping object if mapping succeeds """ - if ( - row.hgvs_pro in {"_wt", "_sy", "NA"} - or "fs" in row.hgvs_pro - or len(row.hgvs_pro) == 3 - ): + if row.hgvs_pro in {"_wt", "_sy", "NA"} or len(row.hgvs_pro) == 3: _logger.warning( "Can't process variant syntax %s for %s", row.hgvs_pro, row.accession ) - return None + return MappedScore( + accession_id=row.accession, + score=row.score, + error_message="Can't process variant syntax", + ) + + if "fs" in row.hgvs_pro: + _logger.warning( + "Can't process variant syntax %s for %s because protein frameshift variants are not supported", + row.hgvs_pro, + row.accession, + ) + return MappedScore( + accession_id=row.accession, + score=row.score, + error_message="Protein frameshift variants are not supported", + ) # TODO: Handle edge cases without hardcoding URNs. 
# Special case for experiment set urn:mavedb:0000097 @@ -330,47 +339,68 @@ def _map_protein_coding_pro( post_mapped=vrs_variation, ) - pre_mapped_hgvs_strings = _create_pre_mapped_hgvs_strings( - row.hgvs_pro, - AnnotationLayer.PROTEIN, - tx=transcript, - ) - post_mapped_hgvs_strings = _create_post_mapped_hgvs_strings( - row.hgvs_pro, - AnnotationLayer.PROTEIN, - tx=transcript, - ) - - pre_mapped_protein = _construct_vrs_allele( - pre_mapped_hgvs_strings, - AnnotationLayer.PROTEIN, - sequence_id, - True, - ) - post_mapped_protein = _construct_vrs_allele( - post_mapped_hgvs_strings, - AnnotationLayer.PROTEIN, - None, - False, - ) + try: + pre_mapped_hgvs_strings = _create_pre_mapped_hgvs_strings( + row.hgvs_pro, + AnnotationLayer.PROTEIN, + tx=transcript, + ) + pre_mapped_protein = _construct_vrs_allele( + pre_mapped_hgvs_strings, + AnnotationLayer.PROTEIN, + sequence_id, + True, + ) + except Exception as e: + _logger.warning( + "An error occurred while generating pre-mapped protein variant for %s, accession %s: %s", + row.hgvs_pro, + row.accession, + e, + ) + return MappedScore(accession_id=row.accession, score=row.score, error_message=e) - if pre_mapped_protein: + try: + post_mapped_hgvs_strings = _create_post_mapped_hgvs_strings( + row.hgvs_pro, + AnnotationLayer.PROTEIN, + tx=transcript, + ) + post_mapped_protein = _construct_vrs_allele( + post_mapped_hgvs_strings, + AnnotationLayer.PROTEIN, + None, + False, + ) + except Exception as e: + _logger.warning( + "An error occurred while generating post-mapped protein variant for %s, accession %s: %s", + row.hgvs_pro, + row.accession, + e, + ) return MappedScore( accession_id=row.accession, score=row.score, annotation_layer=AnnotationLayer.PROTEIN, pre_mapped=pre_mapped_protein, - post_mapped=post_mapped_protein, + error_message=e, ) - return None + return MappedScore( + accession_id=row.accession, + score=row.score, + annotation_layer=AnnotationLayer.PROTEIN, + pre_mapped=pre_mapped_protein, + post_mapped=post_mapped_protein, + ) def _map_genomic( row: ScoreRow, sequence_id: str, align_result: AlignmentResult, -) -> MappedScore | None: +) -> MappedScore: """Construct VRS object mapping for ``hgvs_nt`` variant column entry These arguments are a little lazy and could be pruned down later @@ -380,47 +410,76 @@ def _map_genomic( :param align_result: The transcript selection information for a score set :return: VRS mapping object if mapping succeeds """ - pre_mapped_hgvs_strings = _create_pre_mapped_hgvs_strings( - row.hgvs_nt, - AnnotationLayer.GENOMIC, - alignment=align_result, - ) - post_mapped_hgvs_strings = _create_post_mapped_hgvs_strings( - row.hgvs_nt, - AnnotationLayer.GENOMIC, - alignment=align_result, - ) + if ( + row.hgvs_nt in {"_wt", "_sy", "="} + or "fs" + in row.hgvs_nt # TODO I think this line can be taken out, I don't think fs nomenclature can be used for nt anyway + or len(row.hgvs_nt) == 3 + ): + _logger.warning( + "Can't process variant syntax %s for %s", row.hgvs_nt, row.accession + ) + return MappedScore( + accession_id=row.accession, + score=row.score, + error_message="Can't process variant syntax", + ) - pre_mapped_genomic = _construct_vrs_allele( - pre_mapped_hgvs_strings, - AnnotationLayer.GENOMIC, - sequence_id, - True, - ) + try: + pre_mapped_hgvs_strings = _create_pre_mapped_hgvs_strings( + row.hgvs_nt, + AnnotationLayer.GENOMIC, + alignment=align_result, + ) + pre_mapped_genomic = _construct_vrs_allele( + pre_mapped_hgvs_strings, + AnnotationLayer.GENOMIC, + sequence_id, + True, + ) + except Exception as e: + 
_logger.warning( + "An error occurred while generating pre-mapped genomic variant for %s, accession %s: %s", + row.hgvs_nt, + row.accession, + e, + ) + return MappedScore(accession_id=row.accession, score=row.score, error_message=e) - # genomic post-mapped variants are skipped if they don't fall within aligned region of target sequence - # so we can have a pre-mapped genomic variant without a post-mapped genomic variant - if post_mapped_hgvs_strings: + try: + post_mapped_hgvs_strings = _create_post_mapped_hgvs_strings( + row.hgvs_nt, + AnnotationLayer.GENOMIC, + alignment=align_result, + ) post_mapped_genomic = _construct_vrs_allele( post_mapped_hgvs_strings, AnnotationLayer.GENOMIC, None, False, ) - else: - # TODO add error to MappedScore - post_mapped_genomic = None - - if pre_mapped_genomic: + except Exception as e: + _logger.warning( + "An error occurred while generating post-mapped genomic variant for %s, accession %s: %s", + row.hgvs_nt, + row.accession, + e, + ) return MappedScore( accession_id=row.accession, score=row.score, annotation_layer=AnnotationLayer.GENOMIC, pre_mapped=pre_mapped_genomic, - post_mapped=post_mapped_genomic, + error_message=e, ) - return None + return MappedScore( + accession_id=row.accession, + score=row.score, + annotation_layer=AnnotationLayer.GENOMIC, + pre_mapped=pre_mapped_genomic, + post_mapped=post_mapped_genomic, + ) def _get_allele_sequence(allele: Allele) -> str: @@ -496,24 +555,12 @@ def _map_protein_coding( hgvs_pro_mappings = _map_protein_coding_pro(row, psequence_id, transcript) if hgvs_pro_mappings: variations.append(hgvs_pro_mappings) - else: - _logger.warning( - "Encountered apparently invalid protein variants in %s: %s", - row.accession, - row.hgvs_pro, - ) if _hgvs_nt_is_valid(row.hgvs_nt): hgvs_nt_mappings = _map_genomic(row, gsequence_id, align_result) if hgvs_nt_mappings: variations.append(hgvs_nt_mappings) - else: - _logger.warning( - "Encountered apparently invalid genomic variants in %s: %s", - row.accession, - row.hgvs_nt, - ) return variations @@ -534,26 +581,8 @@ def _map_regulatory_noncoding( sequence_id = store_sequence(metadata.target_sequence) for row in records: - if ( - row.hgvs_nt in {"_wt", "_sy", "="} - or "fs" in row.hgvs_nt - or len(row.hgvs_nt) == 3 - ): - _logger.warning( - "Can't process variant syntax %s for %s", row.hgvs_nt, metadata.urn - ) - continue - hgvs_nt_mappings = _map_genomic(row, sequence_id, align_result) - - if hgvs_nt_mappings: - variations.append(hgvs_nt_mappings) - else: - _logger.warning( - "Encountered apparently invalid genomic variants in %s: %s", - row.accession, - row.hgvs_nt, - ) + variations.append(hgvs_nt_mappings) return variations @@ -646,6 +675,7 @@ def vrs_map( :param silent: If true, suppress console output :return: A list of mapping results """ + # TODO address this hardcoding, and if we keep it, this should be a score set mapping error message if metadata.urn == "urn:mavedb:00000072-a-1": msg = f"No RefSeq accession is available for {metadata.urn}." 
if not silent: From 5ce5a0956893e4283d82d675f46ba439a1b71b55 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Fri, 16 Aug 2024 15:46:40 -0700 Subject: [PATCH 06/16] fixup mapped error messages --- src/dcd_mapping/vrs_map.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/src/dcd_mapping/vrs_map.py b/src/dcd_mapping/vrs_map.py index be68b92..9fb816b 100644 --- a/src/dcd_mapping/vrs_map.py +++ b/src/dcd_mapping/vrs_map.py @@ -227,8 +227,8 @@ def _adjust_genomic_variant_to_ref( if query_subrange_containing_hit is None or target_subrange_containing_hit is None: msg = ( - f"Cannot process variant {variant} because it is not fully contained \ - within the aligned portion of the query sequence", + f"Cannot process variant {variant} because it is not fully contained " + + "within the aligned portion of the query sequence" ) raise ValueError(msg) @@ -312,7 +312,7 @@ def _map_protein_coding_pro( return MappedScore( accession_id=row.accession, score=row.score, - error_message="Can't process variant syntax", + error_message=f"Can't process variant syntax {row.hgvs_pro}", ) if "fs" in row.hgvs_pro: @@ -356,9 +356,11 @@ def _map_protein_coding_pro( "An error occurred while generating pre-mapped protein variant for %s, accession %s: %s", row.hgvs_pro, row.accession, - e, + str(e), + ) + return MappedScore( + accession_id=row.accession, score=row.score, error_message=str(e) ) - return MappedScore(accession_id=row.accession, score=row.score, error_message=e) try: post_mapped_hgvs_strings = _create_post_mapped_hgvs_strings( @@ -377,14 +379,14 @@ def _map_protein_coding_pro( "An error occurred while generating post-mapped protein variant for %s, accession %s: %s", row.hgvs_pro, row.accession, - e, + str(e), ) return MappedScore( accession_id=row.accession, score=row.score, annotation_layer=AnnotationLayer.PROTEIN, pre_mapped=pre_mapped_protein, - error_message=e, + error_message=str(e), ) return MappedScore( @@ -422,7 +424,7 @@ def _map_genomic( return MappedScore( accession_id=row.accession, score=row.score, - error_message="Can't process variant syntax", + error_message=f"Can't process variant syntax {row.hgvs_nt}", ) try: @@ -442,9 +444,11 @@ def _map_genomic( "An error occurred while generating pre-mapped genomic variant for %s, accession %s: %s", row.hgvs_nt, row.accession, - e, + str(e), + ) + return MappedScore( + accession_id=row.accession, score=row.score, error_message=str(e) ) - return MappedScore(accession_id=row.accession, score=row.score, error_message=e) try: post_mapped_hgvs_strings = _create_post_mapped_hgvs_strings( @@ -463,14 +467,14 @@ def _map_genomic( "An error occurred while generating post-mapped genomic variant for %s, accession %s: %s", row.hgvs_nt, row.accession, - e, + str(e), ) return MappedScore( accession_id=row.accession, score=row.score, annotation_layer=AnnotationLayer.GENOMIC, pre_mapped=pre_mapped_genomic, - error_message=e, + error_message=str(e), ) return MappedScore( From 3e771d5440e917d871ed7bd8537331d068e0b782 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Wed, 21 Aug 2024 09:32:53 -0700 Subject: [PATCH 07/16] Add score set-based error handling --- src/dcd_mapping/annotate.py | 48 ++++++++++---- src/dcd_mapping/lookup.py | 15 +++-- src/dcd_mapping/main.py | 111 ++++++++++++++++++++++++++++----- src/dcd_mapping/mavedb_data.py | 12 ++-- src/dcd_mapping/schemas.py | 13 ++-- src/dcd_mapping/vrs_map.py | 20 +++--- 6 files changed, 166 insertions(+), 53 deletions(-) diff --git a/src/dcd_mapping/annotate.py 
b/src/dcd_mapping/annotate.py index cd07624..769c30f 100644 --- a/src/dcd_mapping/annotate.py +++ b/src/dcd_mapping/annotate.py @@ -451,6 +451,9 @@ def _get_mapped_reference_sequence( :return A MappedReferenceSequence object """ if layer == AnnotationLayer.PROTEIN and tx_output is not None: + if tx_output.np is None: + msg = "No NP accession associated with reference transcript" + raise ValueError(msg) vrs_id = get_vrs_id_from_identifier(tx_output.np) if vrs_id is None: raise ValueError @@ -488,6 +491,28 @@ def _set_scoreset_layer( return AnnotationLayer.PROTEIN +def write_scoreset_mapping_to_json( + urn: str, + scoreset_mapping: ScoresetMapping, + output_path: Path | None = None, +) -> Path: + """Write the given ScoresetMapping as a JSON at the specified + or default ouput path. + """ + if not output_path: + now = datetime.datetime.now(tz=datetime.UTC).isoformat() + output_path = LOCAL_STORE_PATH / f"{urn}_mapping_{now}.json" + + _logger.info("Saving mapping output to %s", output_path) + with output_path.open("w") as file: + json.dump( + scoreset_mapping.model_dump(exclude_unset=True, exclude_none=True), + file, + indent=4, + ) + return output_path + + def save_mapped_output_json( urn: str, mappings: list[ScoreAnnotationWithLayer], @@ -526,6 +551,16 @@ def save_mapped_output_json( reference_sequences[layer][ "mapped_reference_sequence" ] = _get_mapped_reference_sequence(layer, tx_output, align_result) + # except Exception as e: + # _logger.warning( + # str(e) + # ) + # output = ScoresetMapping( + # metadata=metadata, + # error_message = str(e).strip("'") + # ) + + # return write_scoreset_mapping_to_json mapped_scores: list[ScoreAnnotation] = [] for m in mappings: @@ -552,15 +587,4 @@ def save_mapped_output_json( mapped_scores=mapped_scores, ) - if not output_path: - now = datetime.datetime.now(tz=datetime.UTC).isoformat() - output_path = LOCAL_STORE_PATH / f"{urn}_mapping_{now}.json" - - _logger.info("Saving mapping output to %s", output_path) - with output_path.open("w") as file: - json.dump( - output.model_dump(exclude_unset=True, exclude_none=True), - file, - indent=4, - ) - return output_path + return write_scoreset_mapping_to_json(urn, output, output_path) diff --git a/src/dcd_mapping/lookup.py b/src/dcd_mapping/lookup.py index 42f80fb..74472a3 100644 --- a/src/dcd_mapping/lookup.py +++ b/src/dcd_mapping/lookup.py @@ -428,7 +428,8 @@ def get_chromosome_identifier(chromosome: str) -> str: for ac in tmp_acs: acs.append(ac.split("refseq:")[-1]) if not acs: - raise KeyError + msg = f"Cannot retrieve NC identifier for {chromosome} from Seqrepo" + raise KeyError(msg) # make sure e.g. 
version .10 > version .9 sorted_results = sorted(acs, key=lambda i: int(i.split(".")[-1])) @@ -446,7 +447,10 @@ def get_ucsc_chromosome_name(chromosome: str) -> str: sr = CoolSeqToolBuilder().seqrepo_access result, _ = sr.translate_identifier(chromosome, "GRCh38") if not result: - raise KeyError + msg = ( + f"Cannot retrieve USCS-style chromosome name for {chromosome} from Seqrepo" + ) + raise KeyError(msg) sorted_results = sorted([r for r in result if "chr" in r]) try: @@ -465,7 +469,8 @@ def get_chromosome_identifier_from_vrs_id(sequence_id: str) -> str | None: sr = CoolSeqToolBuilder().seqrepo_access result, _ = sr.translate_identifier(sequence_id, "refseq") if not result: - raise KeyError + msg = f"Cannot retrieve NC identifier for {sequence_id} from Seqrepo" + raise KeyError(msg) sorted_results = sorted(result) return sorted_results[-1] @@ -479,8 +484,8 @@ def get_vrs_id_from_identifier(sequence_id: str) -> str | None: sr = CoolSeqToolBuilder().seqrepo_access result, _ = sr.translate_identifier(sequence_id, "ga4gh") if not result: - raise KeyError - + msg = f"Cannot retrieve GA4GH SQ identifier for {sequence_id} from Seqrepo" + raise KeyError(msg) sorted_results = sorted(result) return sorted_results[-1] diff --git a/src/dcd_mapping/main.py b/src/dcd_mapping/main.py index 201f1b1..e6f2852 100644 --- a/src/dcd_mapping/main.py +++ b/src/dcd_mapping/main.py @@ -8,12 +8,17 @@ import click from dcd_mapping.align import AlignmentError, BlatNotFoundError, align -from dcd_mapping.annotate import annotate, save_mapped_output_json +from dcd_mapping.annotate import ( + annotate, + save_mapped_output_json, + write_scoreset_mapping_to_json, +) from dcd_mapping.lookup import check_gene_normalizer, check_seqrepo, check_uta from dcd_mapping.mavedb_data import get_scoreset_metadata, get_scoreset_records from dcd_mapping.resource_utils import ResourceAcquisitionError from dcd_mapping.schemas import ( ScoreRow, + ScoresetMapping, ScoresetMetadata, VrsVersion, ) @@ -149,7 +154,14 @@ async def map_scoreset( _emit_info( f"Alignment failed for scoreset {metadata.urn} {e}", silent, logging.ERROR ) - raise e + final_output = write_scoreset_mapping_to_json( + metadata.urn, + ScoresetMapping(metadata=metadata, error_message=str(e).strip("'")), + output_path, + ) + _emit_info(f"Score set mapping output saved to: {final_output}.", silent) + # raise e TODO do we still want to raise an error in this case, after writing the file? + return _emit_info("Alignment complete.", silent) _emit_info("Selecting reference sequence...", silent) @@ -161,7 +173,14 @@ async def map_scoreset( silent, logging.ERROR, ) - raise e + final_output = write_scoreset_mapping_to_json( + metadata.urn, + ScoresetMapping(metadata=metadata, error_message=str(e).strip("'")), + output_path, + ) + _emit_info(f"Score set mapping output saved to: {final_output}.", silent) + # raise e TODO do we still want to raise an error in this case, after writing the file? + return _emit_info("Reference selection complete.", silent) _emit_info("Mapping to VRS...", silent) @@ -171,22 +190,79 @@ async def map_scoreset( _emit_info( f"VRS mapping failed for scoreset {metadata.urn}", silent, logging.ERROR ) - raise e + final_output = write_scoreset_mapping_to_json( + metadata.urn, + ScoresetMapping(metadata=metadata, error_message=str(e).strip("'")), + output_path, + ) + _emit_info(f"Score set mapping output saved to: {final_output}.", silent) + # raise e TODO do we still want to raise an error in this case, after writing the file? 
+ return if vrs_results is None: _emit_info(f"No mapping available for {metadata.urn}", silent) + final_output = write_scoreset_mapping_to_json( + metadata.urn, + ScoresetMapping( + metadata=metadata, + error_message="No variant mappings available for this score set", + ), + output_path, + ) + _emit_info(f"Score set mapping output saved to: {final_output}.", silent) + # raise e TODO do we still want to raise an error in this case, after writing the file? return _emit_info("VRS mapping complete.", silent) _emit_info("Annotating metadata and saving to file...", silent) - vrs_results = annotate(vrs_results, transcript, metadata, vrs_version) - final_output = save_mapped_output_json( - metadata.urn, - vrs_results, - alignment_result, - transcript, - prefer_genomic, - output_path, - ) + try: + vrs_results = annotate(vrs_results, transcript, metadata, vrs_version) + except Exception as e: # TODO create AnnotationError class and replace ValueErrors in annotation steps with AnnotationErrors + _emit_info( + f"VRS annotation failed for scoreset {metadata.urn}", silent, logging.ERROR + ) + final_output = write_scoreset_mapping_to_json( + metadata.urn, + ScoresetMapping(metadata=metadata, error_message=str(e).strip("'")), + output_path, + ) + _emit_info(f"Score set mapping output saved to: {final_output}.", silent) + # raise e TODO do we still want to raise an error in this case, after writing the file? + return + if vrs_results is None: + _emit_info(f"No annotation available for {metadata.urn}", silent) + final_output = write_scoreset_mapping_to_json( + metadata.urn, + ScoresetMapping( + metadata=metadata, + error_message="No annotated variant mappings available for this score set", + ), + output_path, + ) + _emit_info(f"Score set mapping output saved to: {final_output}.", silent) + # raise e TODO do we still want to raise an error in this case, after writing the file? + return + try: + final_output = save_mapped_output_json( + metadata.urn, + vrs_results, + alignment_result, + transcript, + prefer_genomic, + output_path, + ) + except Exception as e: + _emit_info( + f"Error in creating or saving final score set mapping for {metadata.urn} {e}", + silent, + ) + final_output = write_scoreset_mapping_to_json( + metadata.urn, + ScoresetMapping(metadata=metadata, error_message=str(e).strip("'")), + output_path, + ) + _emit_info(f"Score set mapping output saved to: {final_output}.", silent) + # raise e TODO do we still want to raise an error in this case, after writing the file? + return _emit_info(f"Annotated scores saved to: {final_output}.", silent) @@ -211,7 +287,14 @@ async def map_scoreset_urn( msg = f"Unable to acquire resource from MaveDB: {e}" _logger.critical(msg) click.echo(f"Error: {msg}") - raise e + final_output = write_scoreset_mapping_to_json( + urn, + ScoresetMapping(metadata=None, error_message=str(e).strip("'")), + output_path, + ) + _emit_info(f"Score set mapping output saved to: {final_output}.", silent) + # raise e TODO do we still want to raise an error in this case, after writing the file? 
+ return await map_scoreset( metadata, records, output_path, vrs_version, prefer_genomic, silent ) diff --git a/src/dcd_mapping/mavedb_data.py b/src/dcd_mapping/mavedb_data.py index e858dc4..06da30a 100644 --- a/src/dcd_mapping/mavedb_data.py +++ b/src/dcd_mapping/mavedb_data.py @@ -156,13 +156,17 @@ def get_scoreset_metadata( """ metadata = get_raw_scoreset_metadata(scoreset_urn, dcd_mapping_dir) - if not _metadata_response_is_human(metadata): - msg = f"Experiment for {scoreset_urn} contains no human targets" - raise ResourceAcquisitionError(msg) if len(metadata["targetGenes"]) > 1: - msg = f"Multiple target genes for {scoreset_urn} -- look into this." + msg = f"Multiple target genes for {scoreset_urn}. Multi-target score sets are not currently supported." raise ResourceAcquisitionError(msg) gene = metadata["targetGenes"][0] + target_sequence_gene = gene.get("targetSequence") + if target_sequence_gene is None: + msg = f"No target sequence available for {scoreset_urn}. Accession-based score sets are not currently supported." + raise ResourceAcquisitionError(msg) + if not _metadata_response_is_human(metadata): + msg = f"Experiment for {scoreset_urn} contains no human targets" + raise ResourceAcquisitionError(msg) try: structured_data = ScoresetMetadata( urn=metadata["urn"], diff --git a/src/dcd_mapping/schemas.py b/src/dcd_mapping/schemas.py index c9f86fd..d2a7825 100644 --- a/src/dcd_mapping/schemas.py +++ b/src/dcd_mapping/schemas.py @@ -189,9 +189,10 @@ class ScoreAnnotationWithLayer(ScoreAnnotation): class ScoresetMapping(BaseModel): """Provide all mapped scores for a scoreset.""" - metadata: Any # TODO get exact MaveDB metadata structure? - computed_protein_reference_sequence: ComputedReferenceSequence | None - mapped_protein_reference_sequence: MappedReferenceSequence | None - computed_genomic_reference_sequence: ComputedReferenceSequence | None - mapped_genomic_reference_sequence: MappedReferenceSequence | None - mapped_scores: list[ScoreAnnotation] + metadata: Any # TODO get exact MaveDB metadata structure? 
| None + computed_protein_reference_sequence: ComputedReferenceSequence | None = None + mapped_protein_reference_sequence: MappedReferenceSequence | None = None + computed_genomic_reference_sequence: ComputedReferenceSequence | None = None + mapped_genomic_reference_sequence: MappedReferenceSequence | None = None + mapped_scores: list[ScoreAnnotation] | None = None + error_message: str | None = None diff --git a/src/dcd_mapping/vrs_map.py b/src/dcd_mapping/vrs_map.py index 9fb816b..28d5ab7 100644 --- a/src/dcd_mapping/vrs_map.py +++ b/src/dcd_mapping/vrs_map.py @@ -4,7 +4,6 @@ from collections.abc import Iterable from itertools import cycle -import click from Bio.Seq import Seq from cool_seq_tool.schemas import AnnotationLayer, Strand from ga4gh.core import ga4gh_identify, sha512t24u @@ -226,10 +225,7 @@ def _adjust_genomic_variant_to_ref( break if query_subrange_containing_hit is None or target_subrange_containing_hit is None: - msg = ( - f"Cannot process variant {variant} because it is not fully contained " - + "within the aligned portion of the query sequence" - ) + msg = f"Cannot process variant {variant} because it is not fully contained within the aligned portion of the query sequence" raise ValueError(msg) for idx, start in enumerate(starts): @@ -386,7 +382,7 @@ def _map_protein_coding_pro( score=row.score, annotation_layer=AnnotationLayer.PROTEIN, pre_mapped=pre_mapped_protein, - error_message=str(e), + error_message=str(e).strip("'"), ) return MappedScore( @@ -680,12 +676,12 @@ def vrs_map( :return: A list of mapping results """ # TODO address this hardcoding, and if we keep it, this should be a score set mapping error message - if metadata.urn == "urn:mavedb:00000072-a-1": - msg = f"No RefSeq accession is available for {metadata.urn}." - if not silent: - click.echo(msg) - _logger.warning(msg) - return None + # if metadata.urn == "urn:mavedb:00000072-a-1": + # msg = f"No RefSeq accession is available for {metadata.urn}." 
+ # if not silent: + # click.echo(msg) + # _logger.warning(msg) + # return None if metadata.target_gene_category == TargetType.PROTEIN_CODING and transcript: return _map_protein_coding( From 7a7a009fa9c1f194dfae2a3e075358bdf16bdd19 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Thu, 22 Aug 2024 09:21:26 -0700 Subject: [PATCH 08/16] Initialize versioning, add date and version to output --- src/dcd_mapping/__init__.py | 2 ++ src/dcd_mapping/annotate.py | 2 +- src/dcd_mapping/main.py | 1 + src/dcd_mapping/schemas.py | 6 +++++- src/dcd_mapping/version.py | 3 +++ 5 files changed, 12 insertions(+), 2 deletions(-) create mode 100644 src/dcd_mapping/version.py diff --git a/src/dcd_mapping/__init__.py b/src/dcd_mapping/__init__.py index 5525c98..b571764 100644 --- a/src/dcd_mapping/__init__.py +++ b/src/dcd_mapping/__init__.py @@ -8,7 +8,9 @@ from dotenv import load_dotenv from .main import map_scoreset, map_scoreset_urn +from .version import dcd_mapping_version __all__ = ["map_scoreset", "map_scoreset_urn"] +__version__ = dcd_mapping_version load_dotenv() diff --git a/src/dcd_mapping/annotate.py b/src/dcd_mapping/annotate.py index 769c30f..f594a4e 100644 --- a/src/dcd_mapping/annotate.py +++ b/src/dcd_mapping/annotate.py @@ -506,7 +506,7 @@ def write_scoreset_mapping_to_json( _logger.info("Saving mapping output to %s", output_path) with output_path.open("w") as file: json.dump( - scoreset_mapping.model_dump(exclude_unset=True, exclude_none=True), + scoreset_mapping.model_dump(exclude_unset=False, exclude_none=True), file, indent=4, ) diff --git a/src/dcd_mapping/main.py b/src/dcd_mapping/main.py index e6f2852..98a8d2f 100644 --- a/src/dcd_mapping/main.py +++ b/src/dcd_mapping/main.py @@ -181,6 +181,7 @@ async def map_scoreset( _emit_info(f"Score set mapping output saved to: {final_output}.", silent) # raise e TODO do we still want to raise an error in this case, after writing the file? return + # TODO handle UTA errors separately, don't write scoresetmapping to file _emit_info("Reference selection complete.", silent) _emit_info("Mapping to VRS...", silent) diff --git a/src/dcd_mapping/schemas.py b/src/dcd_mapping/schemas.py index d2a7825..6d5c58d 100644 --- a/src/dcd_mapping/schemas.py +++ b/src/dcd_mapping/schemas.py @@ -1,4 +1,5 @@ """Provide class definitions for commonly-used information objects.""" +import datetime from enum import Enum from typing import Any, Literal @@ -7,6 +8,7 @@ from pydantic import BaseModel, ConfigDict, StrictBool, StrictInt, StrictStr from dcd_mapping import vrs_v1_schemas +from dcd_mapping.version import dcd_mapping_version class TargetSequenceType(str, Enum): @@ -189,7 +191,9 @@ class ScoreAnnotationWithLayer(ScoreAnnotation): class ScoresetMapping(BaseModel): """Provide all mapped scores for a scoreset.""" - metadata: Any # TODO get exact MaveDB metadata structure? | None + metadata: Any # TODO get exact MaveDB metadata structure? 
+ dcd_mapping_version: str = dcd_mapping_version + mapped_date_utc: str = datetime.datetime.now(tz=datetime.UTC).isoformat() computed_protein_reference_sequence: ComputedReferenceSequence | None = None mapped_protein_reference_sequence: MappedReferenceSequence | None = None computed_genomic_reference_sequence: ComputedReferenceSequence | None = None diff --git a/src/dcd_mapping/version.py b/src/dcd_mapping/version.py new file mode 100644 index 0000000..8806049 --- /dev/null +++ b/src/dcd_mapping/version.py @@ -0,0 +1,3 @@ +"""Provide dcd mapping version""" + +dcd_mapping_version = "2024.1.0" From 7b4dd8e05e74f50110c5b501ba91f18eb9f4f5d7 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Mon, 26 Aug 2024 09:18:36 -0700 Subject: [PATCH 09/16] Add error handling for external data sources --- src/dcd_mapping/align.py | 13 +++---- src/dcd_mapping/lookup.py | 38 +++++++++++--------- src/dcd_mapping/main.py | 63 ++++++++++++++++++++++++---------- src/dcd_mapping/mavedb_data.py | 12 ++++--- 4 files changed, 79 insertions(+), 47 deletions(-) diff --git a/src/dcd_mapping/align.py b/src/dcd_mapping/align.py index 8d9f878..db2d3f8 100644 --- a/src/dcd_mapping/align.py +++ b/src/dcd_mapping/align.py @@ -205,6 +205,7 @@ def _get_best_hit(output: QueryResult, urn: str, chromosome: str | None) -> Hit: else: if list(output): hit_chrs = [h.id for h in output] + # TODO should this be an error rather than a warning? it seems like a problem if we can't find a hit on the expected chromosome _logger.warning( "Failed to match hit chromosomes during alignment. URN: %s, expected chromosome: %s, hit chromosomes: %s", urn, @@ -221,8 +222,8 @@ def _get_best_hit(output: QueryResult, urn: str, chromosome: str | None) -> Hit: best_score_hit = hit if best_score_hit is None: - _logger.error("Couldn't get hits from %s -- check BLAT output.", urn) - raise AlignmentError + msg = f"Couldn't get BLAT hits from {urn}" + raise AlignmentError(msg) return best_score_hit @@ -246,12 +247,8 @@ def _get_best_hsp(hit: Hit, urn: str, gene_location: GeneLocation | None) -> HSP else: best_hsp = max(hit, key=lambda hsp: hsp.score) if best_hsp is None: - _logger.error( - "Unable to get best HSP from hit -- this should be impossible? urn: %s, hit: %s", - urn, - hit, - ) - raise AlignmentError + msg = f"Unable to get best HSP from BLAT hit: {hit}" + raise AlignmentError(msg) return best_hsp diff --git a/src/dcd_mapping/lookup.py b/src/dcd_mapping/lookup.py index 74472a3..a037af8 100644 --- a/src/dcd_mapping/lookup.py +++ b/src/dcd_mapping/lookup.py @@ -212,12 +212,15 @@ async def get_protein_accession(transcript: str) -> str | None: :param transcript: transcript accession, e.g. 
``"NM_002529.3"`` :return: protein accession if successful """ - uta = CoolSeqToolBuilder().uta_db - query = f""" - SELECT pro_ac FROM {uta.schema}.associated_accessions - WHERE tx_ac = '{transcript}' - """ # noqa: S608 - result = await uta.execute_query(query) + try: + uta = CoolSeqToolBuilder().uta_db + query = f""" + SELECT pro_ac FROM {uta.schema}.associated_accessions + WHERE tx_ac = '{transcript}' + """ # noqa: S608 + result = await uta.execute_query(query) + except Exception as e: + raise DataLookupError from e if result: return result[0]["pro_ac"] return None @@ -239,16 +242,19 @@ async def get_transcripts( :param end: ending position :return: candidate transcript accessions """ - uta = CoolSeqToolBuilder().uta_db - query = f""" - SELECT tx_ac - FROM {uta.schema}.tx_exon_aln_v - WHERE hgnc = '{gene_symbol}' - AND ({start} BETWEEN alt_start_i AND alt_end_i OR {end} BETWEEN alt_start_i AND alt_end_i) - AND alt_ac = '{chromosome_ac}' - AND tx_ac NOT LIKE 'NR_%'; - """ # noqa: S608 - result = await uta.execute_query(query) + try: + uta = CoolSeqToolBuilder().uta_db + query = f""" + SELECT tx_ac + FROM {uta.schema}.tx_exon_aln_v + WHERE hgnc = '{gene_symbol}' + AND ({start} BETWEEN alt_start_i AND alt_end_i OR {end} BETWEEN alt_start_i AND alt_end_i) + AND alt_ac = '{chromosome_ac}' + AND tx_ac NOT LIKE 'NR_%'; + """ # noqa: S608 + result = await uta.execute_query(query) + except Exception as e: + raise DataLookupError from e return [row["tx_ac"] for row in result] diff --git a/src/dcd_mapping/main.py b/src/dcd_mapping/main.py index 98a8d2f..a51fb82 100644 --- a/src/dcd_mapping/main.py +++ b/src/dcd_mapping/main.py @@ -6,6 +6,7 @@ from pathlib import Path import click +from requests import HTTPError from dcd_mapping.align import AlignmentError, BlatNotFoundError, align from dcd_mapping.annotate import ( @@ -13,8 +14,17 @@ save_mapped_output_json, write_scoreset_mapping_to_json, ) -from dcd_mapping.lookup import check_gene_normalizer, check_seqrepo, check_uta -from dcd_mapping.mavedb_data import get_scoreset_metadata, get_scoreset_records +from dcd_mapping.lookup import ( + DataLookupError, + check_gene_normalizer, + check_seqrepo, + check_uta, +) +from dcd_mapping.mavedb_data import ( + ScoresetNotSupportedError, + get_scoreset_metadata, + get_scoreset_records, +) from dcd_mapping.resource_utils import ResourceAcquisitionError from dcd_mapping.schemas import ( ScoreRow, @@ -150,6 +160,9 @@ async def map_scoreset( msg = "BLAT command appears missing. Ensure it is available on the $PATH or use the environment variable BLAT_BIN_PATH to point to it. See instructions in the README prerequisites section for more." _emit_info(msg, silent, logging.ERROR) raise e + except ResourceAcquisitionError as e: + _emit_info(f"BLAT resource could not be acquired: {e}", silent, logging.ERROR) + raise e except AlignmentError as e: _emit_info( f"Alignment failed for scoreset {metadata.urn} {e}", silent, logging.ERROR @@ -160,14 +173,13 @@ async def map_scoreset( output_path, ) _emit_info(f"Score set mapping output saved to: {final_output}.", silent) - # raise e TODO do we still want to raise an error in this case, after writing the file? 
return _emit_info("Alignment complete.", silent) _emit_info("Selecting reference sequence...", silent) try: transcript = await select_transcript(metadata, records, alignment_result) - except TxSelectError as e: + except (TxSelectError, KeyError, ValueError) as e: _emit_info( f"Transcript selection failed for scoreset {metadata.urn}", silent, @@ -179,9 +191,21 @@ async def map_scoreset( output_path, ) _emit_info(f"Score set mapping output saved to: {final_output}.", silent) - # raise e TODO do we still want to raise an error in this case, after writing the file? return - # TODO handle UTA errors separately, don't write scoresetmapping to file + except HTTPError as e: + _emit_info( + f"HTTP error occurred during transcript selection: {e}", + silent, + logging.ERROR, + ) + raise e + except DataLookupError as e: + _emit_info( + f"Data lookup error occurred during transcript selection: {e}", + silent, + logging.ERROR, + ) + raise e _emit_info("Reference selection complete.", silent) _emit_info("Mapping to VRS...", silent) @@ -197,10 +221,9 @@ async def map_scoreset( output_path, ) _emit_info(f"Score set mapping output saved to: {final_output}.", silent) - # raise e TODO do we still want to raise an error in this case, after writing the file? return if vrs_results is None: - _emit_info(f"No mapping available for {metadata.urn}", silent) + _emit_info(f"No mapping available for {metadata.urn}", silent, logging.ERROR) final_output = write_scoreset_mapping_to_json( metadata.urn, ScoresetMapping( @@ -210,7 +233,6 @@ async def map_scoreset( output_path, ) _emit_info(f"Score set mapping output saved to: {final_output}.", silent) - # raise e TODO do we still want to raise an error in this case, after writing the file? return _emit_info("VRS mapping complete.", silent) @@ -227,10 +249,9 @@ async def map_scoreset( output_path, ) _emit_info(f"Score set mapping output saved to: {final_output}.", silent) - # raise e TODO do we still want to raise an error in this case, after writing the file? return if vrs_results is None: - _emit_info(f"No annotation available for {metadata.urn}", silent) + _emit_info(f"No annotation available for {metadata.urn}", silent, logging.ERROR) final_output = write_scoreset_mapping_to_json( metadata.urn, ScoresetMapping( @@ -240,7 +261,6 @@ async def map_scoreset( output_path, ) _emit_info(f"Score set mapping output saved to: {final_output}.", silent) - # raise e TODO do we still want to raise an error in this case, after writing the file? return try: final_output = save_mapped_output_json( @@ -255,6 +275,7 @@ async def map_scoreset( _emit_info( f"Error in creating or saving final score set mapping for {metadata.urn} {e}", silent, + logging.ERROR, ) final_output = write_scoreset_mapping_to_json( metadata.urn, @@ -262,7 +283,6 @@ async def map_scoreset( output_path, ) _emit_info(f"Score set mapping output saved to: {final_output}.", silent) - # raise e TODO do we still want to raise an error in this case, after writing the file? 
return _emit_info(f"Annotated scores saved to: {final_output}.", silent) @@ -284,18 +304,23 @@ async def map_scoreset_urn( try: metadata = get_scoreset_metadata(urn) records = get_scoreset_records(urn, silent) - except ResourceAcquisitionError as e: - msg = f"Unable to acquire resource from MaveDB: {e}" - _logger.critical(msg) - click.echo(f"Error: {msg}") + except ScoresetNotSupportedError as e: + _emit_info(f"Score set not supported: {e}", silent, logging.ERROR) final_output = write_scoreset_mapping_to_json( urn, - ScoresetMapping(metadata=None, error_message=str(e).strip("'")), + ScoresetMapping( + metadata=None, + error_message=str(e).strip("'"), + ), output_path, ) _emit_info(f"Score set mapping output saved to: {final_output}.", silent) - # raise e TODO do we still want to raise an error in this case, after writing the file? return + except ResourceAcquisitionError as e: + msg = f"Unable to acquire resource from MaveDB: {e}" + _logger.critical(msg) + click.echo(f"Error: {msg}") + raise e await map_scoreset( metadata, records, output_path, vrs_version, prefer_genomic, silent ) diff --git a/src/dcd_mapping/mavedb_data.py b/src/dcd_mapping/mavedb_data.py index 06da30a..33617a5 100644 --- a/src/dcd_mapping/mavedb_data.py +++ b/src/dcd_mapping/mavedb_data.py @@ -32,6 +32,10 @@ _logger = logging.getLogger(__name__) +class ScoresetNotSupportedError(Exception): + """Raise when a score set cannot be mapped because it has characteristics that are not currently supported.""" + + def get_scoreset_urns() -> set[str]: """Fetch all scoreset URNs. Since species is annotated at the scoreset target level, we can't yet filter on anything like `homo sapien` -- meaning this is fairly slow. @@ -158,15 +162,15 @@ def get_scoreset_metadata( if len(metadata["targetGenes"]) > 1: msg = f"Multiple target genes for {scoreset_urn}. Multi-target score sets are not currently supported." - raise ResourceAcquisitionError(msg) + raise ScoresetNotSupportedError(msg) gene = metadata["targetGenes"][0] target_sequence_gene = gene.get("targetSequence") if target_sequence_gene is None: msg = f"No target sequence available for {scoreset_urn}. Accession-based score sets are not currently supported." 
- raise ResourceAcquisitionError(msg) + raise ScoresetNotSupportedError(msg) if not _metadata_response_is_human(metadata): msg = f"Experiment for {scoreset_urn} contains no human targets" - raise ResourceAcquisitionError(msg) + raise ScoresetNotSupportedError(msg) try: structured_data = ScoresetMetadata( urn=metadata["urn"], @@ -179,7 +183,7 @@ def get_scoreset_metadata( except (KeyError, ValidationError) as e: msg = f"Unable to extract metadata from API response for scoreset {scoreset_urn}: {e}" _logger.error(msg) - raise ResourceAcquisitionError(msg) from e + raise ScoresetNotSupportedError(msg) from e return structured_data From 4535bcd7e2ff98daa5a8ed68b97c4eb95ffe96e8 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Mon, 26 Aug 2024 09:26:41 -0700 Subject: [PATCH 10/16] Add error messages for VRS id lookups --- src/dcd_mapping/annotate.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/dcd_mapping/annotate.py b/src/dcd_mapping/annotate.py index f594a4e..88fb6b9 100644 --- a/src/dcd_mapping/annotate.py +++ b/src/dcd_mapping/annotate.py @@ -456,7 +456,8 @@ def _get_mapped_reference_sequence( raise ValueError(msg) vrs_id = get_vrs_id_from_identifier(tx_output.np) if vrs_id is None: - raise ValueError + msg = "ID could not be acquired from Seqrepo for transcript identifier" + raise ValueError(msg) return MappedReferenceSequence( sequence_type=TargetSequenceType.PROTEIN, sequence_id=vrs_id, @@ -465,7 +466,8 @@ def _get_mapped_reference_sequence( seq_id = get_chromosome_identifier(align_result.chrom) vrs_id = get_vrs_id_from_identifier(seq_id) if vrs_id is None: - raise ValueError + msg = "ID could not be acquired from Seqrepo for chromosome identifier" + raise ValueError(msg) return MappedReferenceSequence( sequence_type=TargetSequenceType.DNA, sequence_id=vrs_id, From 589c0504a08c66e8c4d2211a03193c7304460105 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Mon, 26 Aug 2024 09:58:18 -0700 Subject: [PATCH 11/16] Add error handling to api map function --- src/api/routers/map.py | 129 +++++++++++++++++++++++++++++------------ 1 file changed, 91 insertions(+), 38 deletions(-) diff --git a/src/api/routers/map.py b/src/api/routers/map.py index ca354d4..3f2ec66 100644 --- a/src/api/routers/map.py +++ b/src/api/routers/map.py @@ -1,21 +1,26 @@ +""""Provide mapping router""" from cool_seq_tool.schemas import AnnotationLayer -from fastapi import APIRouter +from fastapi import APIRouter, HTTPException +from requests import HTTPError -from dcd_mapping.align import align +from dcd_mapping.align import AlignmentError, BlatNotFoundError, align from dcd_mapping.annotate import ( _get_computed_reference_sequence, _get_mapped_reference_sequence, _set_scoreset_layer, annotate, ) +from dcd_mapping.lookup import DataLookupError from dcd_mapping.mavedb_data import ( + ScoresetNotSupportedError, get_raw_scoreset_metadata, get_scoreset_metadata, get_scoreset_records, ) -from dcd_mapping.schemas import ScoreAnnotation, ScoresetMapping -from dcd_mapping.transcripts import select_transcript -from dcd_mapping.vrs_map import vrs_map +from dcd_mapping.resource_utils import ResourceAcquisitionError +from dcd_mapping.schemas import ScoreAnnotation, ScoresetMapping, VrsVersion +from dcd_mapping.transcripts import TxSelectError, select_transcript +from dcd_mapping.vrs_map import VrsMapError, vrs_map router = APIRouter( prefix="/api/v1", tags=["mappings"], responses={404: {"description": "Not found"}} @@ -24,48 +29,98 @@ @router.post(path="/map/{urn}", status_code=200, 
response_model=ScoresetMapping) async def map_scoreset(urn: str) -> ScoresetMapping: - metadata = get_scoreset_metadata(urn) - records = get_scoreset_records(urn, True) + """Perform end-to-end mapping for a scoreset. - alignment_result = align(metadata, True) + :param urn: identifier for a scoreset. + :param output_path: optional path to save output at + :param vrs_version: version of VRS objects to output (1.3 or 2) + :param silent: if True, suppress console information output + """ + try: + metadata = get_scoreset_metadata(urn) + records = get_scoreset_records(urn, True) + except ScoresetNotSupportedError as e: + return ScoresetMapping( + metadata=None, + error_message=str(e).strip("'"), + ) + except ResourceAcquisitionError as e: + msg = f"Unable to acquire resource from MaveDB: {e}" + raise HTTPException(status_code=500, detail=msg) from e - transcript = await select_transcript(metadata, records, alignment_result) + try: + alignment_result = align(metadata, True) + except BlatNotFoundError as e: + msg = "BLAT command appears missing. Ensure it is available on the $PATH or use the environment variable BLAT_BIN_PATH to point to it. See instructions in the README prerequisites section for more." + raise HTTPException(status_code=500, detail=msg) from e + except ResourceAcquisitionError as e: + msg = f"BLAT resource could not be acquired: {e}" + raise HTTPException(status_code=500, detail=msg) from e + except AlignmentError as e: + return ScoresetMapping(metadata=metadata, error_message=str(e).strip("'")) - vrs_results = vrs_map(metadata, alignment_result, records, transcript, True) + try: + transcript = await select_transcript(metadata, records, alignment_result) + except (TxSelectError, KeyError, ValueError) as e: + return ScoresetMapping(metadata=metadata, error_message=str(e).strip("'")) + except HTTPError as e: + msg = f"HTTP error occurred during transcript selection: {e}" + raise HTTPException(status_code=500, detail=msg) from e + except DataLookupError as e: + msg = f"Data lookup error occurred during transcript selection: {e}" + raise HTTPException(status_code=500, detail=msg) from e - # TODO raise server error if vrs_results is None + try: + vrs_results = vrs_map(metadata, alignment_result, records, transcript, True) + except VrsMapError as e: + return ScoresetMapping(metadata=metadata, error_message=str(e).strip("'")) if vrs_results is None: - return None + return ScoresetMapping( + metadata=metadata, + error_message="No variant mappings available for this score set", + ) - vrs_results = annotate(vrs_results, transcript, metadata) + try: + vrs_results = annotate(vrs_results, transcript, metadata, VrsVersion.V_2) + except Exception as e: + return ScoresetMapping(metadata=metadata, error_message=str(e).strip("'")) + if vrs_results is None: + return ScoresetMapping( + metadata=metadata, + error_message="No annotated variant mappings available for this score set", + ) - raw_metadata = get_raw_scoreset_metadata(urn) - # TODO change vrs map back to always use only the preferred layer - # preferred_layers = {mapping.annotation_layer for mapping in vrs_results} - preferred_layers = { - _set_scoreset_layer(urn, vrs_results), - } + try: + raw_metadata = get_raw_scoreset_metadata(urn) + preferred_layers = { + _set_scoreset_layer(urn, vrs_results), + } - reference_sequences = { - layer: {"computed_reference_sequence": None, "mapped_reference_sequence": None} - for layer in AnnotationLayer - } + reference_sequences = { + layer: { + "computed_reference_sequence": None, + 
"mapped_reference_sequence": None, + } + for layer in AnnotationLayer + } - for layer in preferred_layers: - reference_sequences[layer][ - "computed_reference_sequence" - ] = _get_computed_reference_sequence(urn, layer, transcript) - reference_sequences[layer][ - "mapped_reference_sequence" - ] = _get_mapped_reference_sequence(layer, transcript, alignment_result) + for layer in preferred_layers: + reference_sequences[layer][ + "computed_reference_sequence" + ] = _get_computed_reference_sequence(urn, layer, transcript) + reference_sequences[layer][ + "mapped_reference_sequence" + ] = _get_mapped_reference_sequence(layer, transcript, alignment_result) - mapped_scores: list[ScoreAnnotation] = [] - for m in vrs_results: - if m.annotation_layer in preferred_layers: - # drop annotation layer from mapping object - mapped_scores.append(ScoreAnnotation(**m.model_dump())) + mapped_scores: list[ScoreAnnotation] = [] + for m in vrs_results: + if m.annotation_layer in preferred_layers: + # drop annotation layer from mapping object + mapped_scores.append(ScoreAnnotation(**m.model_dump())) + except Exception as e: + return ScoresetMapping(metadata=metadata, error_message=str(e).strip("'")) - output = ScoresetMapping( + return ScoresetMapping( metadata=raw_metadata, computed_protein_reference_sequence=reference_sequences[ AnnotationLayer.PROTEIN @@ -81,5 +136,3 @@ async def map_scoreset(urn: str) -> ScoresetMapping: ], mapped_scores=mapped_scores, ) - - return output From 5b3c2fb8da17767b0b2138289405ccd30e761931 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Mon, 26 Aug 2024 14:40:00 -0700 Subject: [PATCH 12/16] Add API docstrings --- src/api/__init__.py | 1 + src/api/routers/__init__.py | 1 + src/api/server_main.py | 3 +-- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/api/__init__.py b/src/api/__init__.py index e69de29..2e93371 100644 --- a/src/api/__init__.py +++ b/src/api/__init__.py @@ -0,0 +1 @@ +"""Provide VRS mapping utilities API""" diff --git a/src/api/routers/__init__.py b/src/api/routers/__init__.py index e69de29..feb4a14 100644 --- a/src/api/routers/__init__.py +++ b/src/api/routers/__init__.py @@ -0,0 +1 @@ +"""Provide routers for dcd mapping API""" diff --git a/src/api/server_main.py b/src/api/server_main.py index b83da73..0df2a50 100644 --- a/src/api/server_main.py +++ b/src/api/server_main.py @@ -11,5 +11,4 @@ # If the application is not already being run within a uvicorn server, start uvicorn here. if __name__ == "__main__": - uvicorn.run(app, host="0.0.0.0", port=8000) - + uvicorn.run(app, host="0.0.0.0", port=8000) # noqa: S104 From 2fbecd0225154788ccea1f43ea44d626e058aaab Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Wed, 4 Sep 2024 15:41:55 -0700 Subject: [PATCH 13/16] Use Field() to set values for default fields in ScoresetMapping Since ScoresetMapping is returned as an API response, the typical way of setting default values does not work. 
--- src/dcd_mapping/schemas.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/dcd_mapping/schemas.py b/src/dcd_mapping/schemas.py index 6d5c58d..45f70e2 100644 --- a/src/dcd_mapping/schemas.py +++ b/src/dcd_mapping/schemas.py @@ -5,7 +5,7 @@ from cool_seq_tool.schemas import AnnotationLayer, Strand, TranscriptPriority from ga4gh.vrs._internal.models import Allele, Haplotype -from pydantic import BaseModel, ConfigDict, StrictBool, StrictInt, StrictStr +from pydantic import BaseModel, ConfigDict, Field, StrictBool, StrictInt, StrictStr from dcd_mapping import vrs_v1_schemas from dcd_mapping.version import dcd_mapping_version @@ -192,8 +192,10 @@ class ScoresetMapping(BaseModel): """Provide all mapped scores for a scoreset.""" metadata: Any # TODO get exact MaveDB metadata structure? - dcd_mapping_version: str = dcd_mapping_version - mapped_date_utc: str = datetime.datetime.now(tz=datetime.UTC).isoformat() + dcd_mapping_version: str = Field(default=dcd_mapping_version) + mapped_date_utc: str = Field( + default=datetime.datetime.now(tz=datetime.UTC).isoformat() + ) computed_protein_reference_sequence: ComputedReferenceSequence | None = None mapped_protein_reference_sequence: MappedReferenceSequence | None = None computed_genomic_reference_sequence: ComputedReferenceSequence | None = None From 46f1b2de50bafc6581972543f9605bfad5e0a09f Mon Sep 17 00:00:00 2001 From: Ben Capodanno Date: Thu, 12 Sep 2024 09:19:49 -0700 Subject: [PATCH 14/16] Make Authenticated Requests if Envvars are Set --- settings/.env.dev | 7 +++++++ src/dcd_mapping/mavedb_data.py | 20 +++++++++++++++----- src/dcd_mapping/resource_utils.py | 12 +++++++++++- 3 files changed, 33 insertions(+), 6 deletions(-) diff --git a/settings/.env.dev b/settings/.env.dev index f7980e6..4e761c7 100644 --- a/settings/.env.dev +++ b/settings/.env.dev @@ -19,6 +19,13 @@ POSTGRES_DB=gene_normalizer UTA_DB_URL=postgresql://anonymous:anonymous@uta.biocommons.org:5432/uta/uta_20180821 +#################################################################################################### +# Environment variables for MaveDB connection +#################################################################################################### + +MAVEDB_BASE_URL=http://localhost:8000 +MAVEDB_API_KEY= + #################################################################################################### # Environment variables for seqrepo #################################################################################################### diff --git a/src/dcd_mapping/mavedb_data.py b/src/dcd_mapping/mavedb_data.py index 33617a5..5b94e5d 100644 --- a/src/dcd_mapping/mavedb_data.py +++ b/src/dcd_mapping/mavedb_data.py @@ -15,7 +15,9 @@ from dcd_mapping.resource_utils import ( LOCAL_STORE_PATH, + MAVEDB_BASE_URL, ResourceAcquisitionError, + authentication_header, http_download, ) from dcd_mapping.schemas import ScoreRow, ScoresetMetadata, UniProtRef @@ -42,7 +44,11 @@ def get_scoreset_urns() -> set[str]: :return: set of URN strings """ - r = requests.get("https://api.mavedb.org/api/v1/experiments/", timeout=30) + r = requests.get( + f"{MAVEDB_BASE_URL}/api/v1/experiments/", + timeout=30, + headers=authentication_header(), + ) r.raise_for_status() scoreset_urn_lists = [ experiment["scoreSetUrns"] @@ -78,7 +84,11 @@ def get_human_urns() -> list[str]: scoreset_urns = get_scoreset_urns() human_scoresets: list[str] = [] for urn in scoreset_urns: - r = requests.get(f"https://api.mavedb.org/api/v1/score-sets/{urn}", timeout=30) + r = 
requests.get( + f"{MAVEDB_BASE_URL}/api/v1/score-sets/{urn}", + timeout=30, + headers=authentication_header(), + ) try: r.raise_for_status() except requests.exceptions.HTTPError: @@ -127,8 +137,8 @@ def get_raw_scoreset_metadata( dcd_mapping_dir = LOCAL_STORE_PATH metadata_file = dcd_mapping_dir / f"{scoreset_urn}_metadata.json" if not metadata_file.exists(): - url = f"https://api.mavedb.org/api/v1/score-sets/{scoreset_urn}" - r = requests.get(url, timeout=30) + url = f"{MAVEDB_BASE_URL}/api/v1/score-sets/{scoreset_urn}" + r = requests.get(url, timeout=30, headers=authentication_header()) try: r.raise_for_status() except requests.HTTPError as e: @@ -246,7 +256,7 @@ def get_scoreset_records( if urn == "urn:mavedb:00000053-a-1": _get_experiment_53_scores(scores_csv, silent) else: - url = f"https://api.mavedb.org/api/v1/score-sets/{urn}/scores" + url = f"{MAVEDB_BASE_URL}/api/v1/score-sets/{urn}/scores" try: http_download(url, scores_csv, silent) except requests.HTTPError as e: diff --git a/src/dcd_mapping/resource_utils.py b/src/dcd_mapping/resource_utils.py index c6269f2..153a99c 100644 --- a/src/dcd_mapping/resource_utils.py +++ b/src/dcd_mapping/resource_utils.py @@ -6,6 +6,9 @@ import requests from tqdm import tqdm +MAVEDB_API_KEY = os.environ.get("MAVEDB_API_KEY") +MAVEDB_BASE_URL = os.environ.get("MAVEDB_BASE_URL") + LOCAL_STORE_PATH = Path( os.environ.get( "DCD_MAPPING_RESOURCES_DIR", Path.home() / ".local" / "share" / "dcd_mapping" @@ -19,6 +22,11 @@ class ResourceAcquisitionError(Exception): """Raise when resource acquisition fails.""" +def authentication_header() -> dict | None: + """Fetch with api key envvar, if available.""" + return {"X-API-key": MAVEDB_API_KEY} if MAVEDB_API_KEY is not None else None + + def http_download(url: str, out_path: Path, silent: bool = True) -> Path: """Download a file via HTTP. @@ -30,7 +38,9 @@ def http_download(url: str, out_path: Path, silent: bool = True) -> Path: """ if not silent: click.echo(f"Downloading {out_path.name} to {out_path.parents[0].absolute()}") - with requests.get(url, stream=True, timeout=30) as r: + with requests.get( + url, stream=True, timeout=30, headers=authentication_header() + ) as r: r.raise_for_status() total_size = int(r.headers.get("content-length", 0)) with out_path.open("wb") as h: From 3a097b54181dd6115dc4896136e03c8eaf12d358 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Fri, 13 Sep 2024 13:52:09 -0700 Subject: [PATCH 15/16] User python serialization mode instead of json FastAPI uses json serialization by default, which flattens the Allele sequence location field. To avoid this, use pydantic's serializer with python mode instead, and return the response directly as a JSON response. 
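For illustration, a minimal sketch of the approach described above (FastAPI and pydantic v2 assumed; the route and model are placeholders, not the project's). Returning a Response subclass directly means FastAPI sends it as-is instead of re-serializing through a response_model, which is what keeps nested fields intact:

    from fastapi import FastAPI
    from fastapi.responses import JSONResponse
    from pydantic import BaseModel

    app = FastAPI()

    class Result(BaseModel):
        urn: str
        error_message: str | None = None

    @app.post("/demo/{urn}")
    async def demo(urn: str) -> JSONResponse:
        result = Result(urn=urn)
        # model_dump() uses pydantic's python mode by default; exclude_none drops
        # unset optional fields from the payload, mirroring the change below.
        return JSONResponse(content=result.model_dump(exclude_none=True))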
--- src/api/routers/map.py | 63 ++++++++++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 20 deletions(-) diff --git a/src/api/routers/map.py b/src/api/routers/map.py index 3f2ec66..8577263 100644 --- a/src/api/routers/map.py +++ b/src/api/routers/map.py @@ -1,6 +1,7 @@ """"Provide mapping router""" from cool_seq_tool.schemas import AnnotationLayer from fastapi import APIRouter, HTTPException +from fastapi.responses import JSONResponse from requests import HTTPError from dcd_mapping.align import AlignmentError, BlatNotFoundError, align @@ -57,12 +58,20 @@ async def map_scoreset(urn: str) -> ScoresetMapping: msg = f"BLAT resource could not be acquired: {e}" raise HTTPException(status_code=500, detail=msg) from e except AlignmentError as e: - return ScoresetMapping(metadata=metadata, error_message=str(e).strip("'")) + return JSONResponse( + content=ScoresetMapping( + metadata=metadata, error_message=str(e).strip("'") + ).model_dump(exclude_none=True) + ) try: transcript = await select_transcript(metadata, records, alignment_result) except (TxSelectError, KeyError, ValueError) as e: - return ScoresetMapping(metadata=metadata, error_message=str(e).strip("'")) + return JSONResponse( + content=ScoresetMapping( + metadata=metadata, error_message=str(e).strip("'") + ).model_dump(exclude_none=True) + ) except HTTPError as e: msg = f"HTTP error occurred during transcript selection: {e}" raise HTTPException(status_code=500, detail=msg) from e @@ -73,7 +82,11 @@ async def map_scoreset(urn: str) -> ScoresetMapping: try: vrs_results = vrs_map(metadata, alignment_result, records, transcript, True) except VrsMapError as e: - return ScoresetMapping(metadata=metadata, error_message=str(e).strip("'")) + return JSONResponse( + content=ScoresetMapping( + metadata=metadata, error_message=str(e).strip("'") + ).model_dump(exclude_none=True) + ) if vrs_results is None: return ScoresetMapping( metadata=metadata, @@ -83,7 +96,11 @@ async def map_scoreset(urn: str) -> ScoresetMapping: try: vrs_results = annotate(vrs_results, transcript, metadata, VrsVersion.V_2) except Exception as e: - return ScoresetMapping(metadata=metadata, error_message=str(e).strip("'")) + return JSONResponse( + content=ScoresetMapping( + metadata=metadata, error_message=str(e).strip("'") + ).model_dump(exclude_none=True) + ) if vrs_results is None: return ScoresetMapping( metadata=metadata, @@ -118,21 +135,27 @@ async def map_scoreset(urn: str) -> ScoresetMapping: # drop annotation layer from mapping object mapped_scores.append(ScoreAnnotation(**m.model_dump())) except Exception as e: - return ScoresetMapping(metadata=metadata, error_message=str(e).strip("'")) + return JSONResponse( + content=ScoresetMapping( + metadata=metadata, error_message=str(e).strip("'") + ).model_dump(exclude_none=True) + ) - return ScoresetMapping( - metadata=raw_metadata, - computed_protein_reference_sequence=reference_sequences[ - AnnotationLayer.PROTEIN - ]["computed_reference_sequence"], - mapped_protein_reference_sequence=reference_sequences[AnnotationLayer.PROTEIN][ - "mapped_reference_sequence" - ], - computed_genomic_reference_sequence=reference_sequences[ - AnnotationLayer.GENOMIC - ]["computed_reference_sequence"], - mapped_genomic_reference_sequence=reference_sequences[AnnotationLayer.GENOMIC][ - "mapped_reference_sequence" - ], - mapped_scores=mapped_scores, + return JSONResponse( + content=ScoresetMapping( + metadata=raw_metadata, + computed_protein_reference_sequence=reference_sequences[ + AnnotationLayer.PROTEIN + 
]["computed_reference_sequence"], + mapped_protein_reference_sequence=reference_sequences[ + AnnotationLayer.PROTEIN + ]["mapped_reference_sequence"], + computed_genomic_reference_sequence=reference_sequences[ + AnnotationLayer.GENOMIC + ]["computed_reference_sequence"], + mapped_genomic_reference_sequence=reference_sequences[ + AnnotationLayer.GENOMIC + ]["mapped_reference_sequence"], + mapped_scores=mapped_scores, + ).model_dump(exclude_none=True) ) From cc8f3ad7bd99d08b1cc7a913d974d7cdf688c852 Mon Sep 17 00:00:00 2001 From: Ben Capodanno Date: Fri, 13 Sep 2024 15:14:29 -0700 Subject: [PATCH 16/16] Set envvar in Git Actions Tests --- .github/workflows/checks.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/checks.yaml b/.github/workflows/checks.yaml index 3656ac6..1c4587e 100644 --- a/.github/workflows/checks.yaml +++ b/.github/workflows/checks.yaml @@ -4,6 +4,8 @@ jobs: test: name: test py${{ matrix.python-version }} runs-on: ubuntu-latest + env: + MAVEDB_BASE_URL: https://api.mavedb.org strategy: matrix: python-version: ["3.11", "3.12"]