MaveDB Mapping v2024.1.2 #30

Merged · 9 commits · Nov 21, 2024
22 changes: 17 additions & 5 deletions src/api/routers/map.py
@@ -1,4 +1,6 @@
""""Provide mapping router"""
from pathlib import Path

from cool_seq_tool.schemas import AnnotationLayer
from fastapi import APIRouter, HTTPException
from fastapi.responses import JSONResponse
@@ -17,6 +19,7 @@
get_raw_scoreset_metadata,
get_scoreset_metadata,
get_scoreset_records,
with_mavedb_score_set,
)
from dcd_mapping.resource_utils import ResourceAcquisitionError
from dcd_mapping.schemas import ScoreAnnotation, ScoresetMapping, VrsVersion
@@ -29,7 +32,8 @@


@router.post(path="/map/{urn}", status_code=200, response_model=ScoresetMapping)
async def map_scoreset(urn: str) -> ScoresetMapping:
@with_mavedb_score_set
async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapping:
"""Perform end-to-end mapping for a scoreset.

:param urn: identifier for a scoreset.
@@ -38,8 +42,8 @@ async def map_scoreset(urn: str) -> ScoresetMapping:
:param silent: if True, suppress console information output
"""
try:
metadata = get_scoreset_metadata(urn)
records = get_scoreset_records(urn, True)
metadata = get_scoreset_metadata(urn, store_path)
records = get_scoreset_records(urn, True, store_path)
except ScoresetNotSupportedError as e:
return ScoresetMapping(
metadata=None,
@@ -49,6 +53,14 @@
msg = f"Unable to acquire resource from MaveDB: {e}"
raise HTTPException(status_code=500, detail=msg) from e

if not records:
return JSONResponse(
content=ScoresetMapping(
metadata=metadata,
error_message="Score set contains no variants to map",
).model_dump(exclude_none=True)
)

try:
alignment_result = align(metadata, True)
except BlatNotFoundError as e:
@@ -108,7 +120,7 @@ async def map_scoreset(urn: str) -> ScoresetMapping:
)

try:
raw_metadata = get_raw_scoreset_metadata(urn)
raw_metadata = get_raw_scoreset_metadata(urn, store_path)
preferred_layers = {
_set_scoreset_layer(urn, vrs_results),
}
@@ -124,7 +136,7 @@ async def map_scoreset(urn: str) -> ScoresetMapping:
for layer in preferred_layers:
reference_sequences[layer][
"computed_reference_sequence"
] = _get_computed_reference_sequence(urn, layer, transcript)
] = _get_computed_reference_sequence(metadata, layer, transcript)
reference_sequences[layer][
"mapped_reference_sequence"
] = _get_mapped_reference_sequence(layer, transcript, alignment_result)
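For context, a minimal sketch of how the updated endpoint could be exercised; the app wiring, import path, and example URN below are assumptions, not part of this diff. The `with_mavedb_score_set` decorator downloads the score set into a temporary directory and injects its location as `store_path`, so callers still send only the URN.

```python
# Hypothetical smoke test for the decorated endpoint (app wiring assumed).
from fastapi import FastAPI
from fastapi.testclient import TestClient

from api.routers.map import router  # import path assumed from src/api/routers/map.py

app = FastAPI()
app.include_router(router)
client = TestClient(app)

# The decorator fetches metadata/scores once into a temp dir and passes it as
# store_path; the request itself only carries the URN.
response = client.post("/map/urn:mavedb:00000041-a-1")  # placeholder URN
print(response.status_code, response.json().get("error_message"))
```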
25 changes: 6 additions & 19 deletions src/dcd_mapping/annotate.py
@@ -29,7 +29,6 @@
get_seqrepo,
get_vrs_id_from_identifier,
)
from dcd_mapping.mavedb_data import get_raw_scoreset_metadata, get_scoreset_metadata
from dcd_mapping.resource_utils import LOCAL_STORE_PATH
from dcd_mapping.schemas import (
AlignmentResult,
@@ -409,7 +408,7 @@ def annotate(


def _get_computed_reference_sequence(
ss: str,
metadata: ScoresetMetadata,
layer: AnnotationLayer,
tx_output: TxSelectResult | None = None,
) -> ComputedReferenceSequence:
@@ -429,7 +428,6 @@ def _get_computed_reference_sequence(
sequence_type=TargetSequenceType.PROTEIN,
sequence_id=seq_id,
)
metadata = get_scoreset_metadata(ss)
seq_id = f"ga4gh:SQ.{sha512t24u(metadata.target_sequence.encode('ascii'))}"
return ComputedReferenceSequence(
sequence=metadata.target_sequence,
@@ -516,7 +514,7 @@ def write_scoreset_mapping_to_json(


def save_mapped_output_json(
urn: str,
metadata: ScoresetMetadata,
mappings: list[ScoreAnnotationWithLayer],
align_result: AlignmentResult,
tx_output: TxSelectResult | None,
@@ -533,10 +531,9 @@ def save_mapped_output_json(
<dcd_mapping_data_dir>/urn:mavedb:00000XXX-X-X_mapping_<ISO8601 datetime>.json
:return: output location
"""
metadata = get_raw_scoreset_metadata(urn)
if preferred_layer_only:
preferred_layers = {
_set_scoreset_layer(urn, mappings),
_set_scoreset_layer(metadata.urn, mappings),
}
else:
preferred_layers = {mapping.annotation_layer for mapping in mappings}
@@ -549,20 +546,10 @@
for layer in preferred_layers:
reference_sequences[layer][
"computed_reference_sequence"
] = _get_computed_reference_sequence(urn, layer, tx_output)
] = _get_computed_reference_sequence(metadata, layer, tx_output)
reference_sequences[layer][
"mapped_reference_sequence"
] = _get_mapped_reference_sequence(layer, tx_output, align_result)
# except Exception as e:
# _logger.warning(
# str(e)
# )
# output = ScoresetMapping(
# metadata=metadata,
# error_message = str(e).strip("'")
# )

# return write_scoreset_mapping_to_json

mapped_scores: list[ScoreAnnotation] = []
for m in mappings:
@@ -573,7 +560,7 @@
mapped_scores.append(ScoreAnnotation(**m.model_dump()))

output = ScoresetMapping(
metadata=metadata,
metadata=metadata.model_dump(),
computed_protein_reference_sequence=reference_sequences[
AnnotationLayer.PROTEIN
]["computed_reference_sequence"],
@@ -589,4 +576,4 @@
mapped_scores=mapped_scores,
)

return write_scoreset_mapping_to_json(urn, output, output_path)
return write_scoreset_mapping_to_json(metadata.urn, output, output_path)
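The refactor above threads the already-loaded `ScoresetMetadata` object through `_get_computed_reference_sequence` and `save_mapped_output_json` instead of re-fetching it by URN. As a rough illustration of the digest step those functions rely on, here is a standalone sketch; the import path and the example sequence are assumptions.

```python
# Sketch of deriving a computed reference sequence ID from a target sequence,
# mirroring the sha512t24u call above; the sequence here is made up.
from ga4gh.core import sha512t24u  # import path assumed

target_sequence = "ATGGCCAAGGAA"
seq_id = f"ga4gh:SQ.{sha512t24u(target_sequence.encode('ascii'))}"
print(seq_id)  # ga4gh:SQ.<truncated SHA-512 digest, base64url-encoded>
```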
23 changes: 20 additions & 3 deletions src/dcd_mapping/main.py
@@ -24,6 +24,7 @@
ScoresetNotSupportedError,
get_scoreset_metadata,
get_scoreset_records,
with_mavedb_score_set,
)
from dcd_mapping.resource_utils import ResourceAcquisitionError
from dcd_mapping.schemas import (
@@ -264,7 +265,7 @@ async def map_scoreset(
return
try:
final_output = save_mapped_output_json(
metadata.urn,
metadata,
vrs_results,
alignment_result,
transcript,
@@ -287,12 +288,14 @@
_emit_info(f"Annotated scores saved to: {final_output}.", silent)


@with_mavedb_score_set
async def map_scoreset_urn(
urn: str,
output_path: Path | None = None,
vrs_version: VrsVersion = VrsVersion.V_2,
prefer_genomic: bool = False,
silent: bool = True,
store_path: Path | None = None,
) -> None:
"""Perform end-to-end mapping for a scoreset.

@@ -302,8 +305,8 @@ async def map_scoreset_urn(
:param silent: if True, suppress console information output
"""
try:
metadata = get_scoreset_metadata(urn)
records = get_scoreset_records(urn, silent)
metadata = get_scoreset_metadata(urn, store_path)
records = get_scoreset_records(urn, silent, store_path)
except ScoresetNotSupportedError as e:
_emit_info(f"Score set not supported: {e}", silent, logging.ERROR)
final_output = write_scoreset_mapping_to_json(
Expand All @@ -321,6 +324,20 @@ async def map_scoreset_urn(
_logger.critical(msg)
click.echo(f"Error: {msg}")
raise e

if not records:
_emit_info("Score set contains no variants to map", silent, logging.ERROR)
final_output = write_scoreset_mapping_to_json(
urn,
ScoresetMapping(
metadata=metadata,
error_message="Score set contains no variants to map",
),
output_path,
)
_emit_info(f"Score set mapping output saved to: {final_output}.", silent)
return

await map_scoreset(
metadata, records, output_path, vrs_version, prefer_genomic, silent
)
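A minimal sketch of driving the decorated entry point; the URN and output path are placeholders, and `with_mavedb_score_set` supplies `store_path` automatically, so it is not passed here.

```python
# Hypothetical driver for the decorated pipeline entry point.
import asyncio
from pathlib import Path

from dcd_mapping.main import map_scoreset_urn
from dcd_mapping.schemas import VrsVersion

asyncio.run(
    map_scoreset_urn(
        "urn:mavedb:00000041-a-1",     # placeholder URN
        output_path=Path("mappings"),  # assumed to be a writable output location
        vrs_version=VrsVersion.V_2,
        silent=False,
    )
)
```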
30 changes: 29 additions & 1 deletion src/dcd_mapping/mavedb_data.py
@@ -2,11 +2,14 @@

Much of this can/should be replaced by the ``mavetools`` library? (and/or ``wags-tails``.)
"""

import csv
import json
import logging
import tempfile
import zipfile
from collections.abc import Callable
from functools import wraps
from pathlib import Path
from typing import Any

@@ -20,7 +23,7 @@
authentication_header,
http_download,
)
from dcd_mapping.schemas import ScoreRow, ScoresetMetadata, UniProtRef
from dcd_mapping.schemas import ScoreRow, ScoresetMapping, ScoresetMetadata, UniProtRef

__all__ = [
"get_scoreset_urns",
@@ -135,6 +138,7 @@ def get_raw_scoreset_metadata(
"""
if not dcd_mapping_dir:
dcd_mapping_dir = LOCAL_STORE_PATH

metadata_file = dcd_mapping_dir / f"{scoreset_urn}_metadata.json"
if not metadata_file.exists():
url = f"{MAVEDB_BASE_URL}/api/v1/score-sets/{scoreset_urn}"
@@ -265,3 +269,27 @@ def get_scoreset_records(
raise ResourceAcquisitionError(msg) from e

return _load_scoreset_records(scores_csv)


def with_mavedb_score_set(fn: Callable) -> Callable:
@wraps(fn)
async def wrapper(*args, **kwargs) -> ScoresetMapping: # noqa: ANN002
urn = args[0] if args else kwargs["urn"]
silent = kwargs.get("silent", False)

with tempfile.TemporaryDirectory(
prefix=f"{LOCAL_STORE_PATH.as_posix()}/"
) as temp_dir:
# Download metadata and scores once for this run so that subsequent calls
# can read them from the temp directory instead of re-fetching from MaveDB.
temp_dir_as_path = Path(temp_dir)
get_scoreset_metadata(urn, temp_dir_as_path)
get_scoreset_records(urn, silent, temp_dir_as_path)

# Pass the storage path of the temp directory to the wrapped function as a kwarg.
kwargs["store_path"] = temp_dir_as_path
v: ScoresetMapping = await fn(*args, **kwargs)

return v

return wrapper
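Beyond the two call sites changed in this PR, the decorator pattern generalizes to any async function that accepts a `store_path` keyword. A hedged sketch follows: the wrapped helper below is hypothetical, and while the wrapper is annotated as returning `ScoresetMapping`, any awaited value passes through at runtime.

```python
# Hypothetical use of with_mavedb_score_set on a helper that only counts records.
from pathlib import Path

from dcd_mapping.mavedb_data import get_scoreset_records, with_mavedb_score_set


@with_mavedb_score_set
async def count_variants(urn: str, store_path: Path | None = None) -> int:
    # The decorator has already downloaded the scores into store_path, so this
    # call reads the cached CSV rather than hitting MaveDB again.
    records = get_scoreset_records(urn, True, store_path)
    return len(records)
```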
6 changes: 3 additions & 3 deletions src/dcd_mapping/schemas.py
@@ -21,9 +21,9 @@ class TargetSequenceType(str, Enum):
class TargetType(str, Enum):
"""Define target gene types."""

PROTEIN_CODING = "Protein coding"
REGULATORY = "Regulatory"
OTHER_NC = "Other noncoding"
PROTEIN_CODING = "protein_coding"
REGULATORY = "regulatory"
OTHER_NC = "other_noncoding"


class VrsVersion(str, Enum):
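The `TargetType` values now use snake_case category strings, presumably to match what the MaveDB API returns; parsing a raw category then looks like the sketch below. The raw string shown is an assumption about the API payload, not taken from this diff.

```python
# Illustrative parse of a target category against the updated enum values.
from dcd_mapping.schemas import TargetType

raw_category = "protein_coding"         # assumed API form; was "Protein coding" before
target_type = TargetType(raw_category)  # would raise ValueError under the old values
assert target_type is TargetType.PROTEIN_CODING
```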
2 changes: 1 addition & 1 deletion src/dcd_mapping/version.py
@@ -1,3 +1,3 @@
"""Provide dcd mapping version"""

dcd_mapping_version = "2024.1.1"
dcd_mapping_version = "2024.1.2"