Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat!: use cat-vrs-python + va-spec-python #404

Merged
merged 12 commits into from
Nov 20, 2024
8 changes: 0 additions & 8 deletions docs/source/reference/api/metakb.schemas.annotation.rst

This file was deleted.

This file was deleted.

This file was deleted.

3 changes: 0 additions & 3 deletions docs/source/reference/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,8 @@ Data Schemas
:toctree: api/
:template: module_summary.rst

metakb.schemas.annotation
metakb.schemas.api
metakb.schemas.app
metakb.schemas.categorical_variation
metakb.schemas.variation_statement

Harvesters
----------
Expand Down
6 changes: 4 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@ requires-python = ">=3.10"
description = "A search interface for cancer variant interpretations assembled by aggregating and harmonizing across multiple cancer variant interpretation knowledgebases."
license = {file = "LICENSE"}
dependencies = [
"ga4gh.vrs~=2.0.0a10",
"ga4gh.vrs~=2.0.0a12",
"ga4gh.cat_vrs~=0.1.0",
"ga4gh.va_spec~=0.1.0",
"gene-normalizer[etl]~=0.4.1",
"variation-normalizer~=0.10.0",
"disease-normalizer[etl]~=0.5.0",
Expand All @@ -42,7 +44,7 @@ dependencies = [
dynamic = ["version"]

[project.optional-dependencies]
tests = ["pytest", "pytest-cov", "mock", "pytest-asyncio"]
tests = ["pytest", "pytest-cov", "mock", "pytest-asyncio", "deepdiff"]
dev = ["pre-commit>=3.7.1", "ruff==0.5.0"]
notebooks = ["ipykernel", "jupyterlab"]
docs = [
Expand Down
6 changes: 3 additions & 3 deletions src/metakb/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,15 +74,15 @@ def _get_credentials(

# Uniqueness constraints applied to the Neo4j graph at initialization.
# Maps constraint name -> Cypher DDL creating it (idempotent via IF NOT EXISTS).
# NOTE: the previous revision of this dict contained stale duplicate entries
# left over from a rename (a second "gene_id_constraint" with malformed Cypher
# "(n:Gene})", plus constraints for the retired CategoricalVariation and Study
# node labels); those are removed here so only the current labels are constrained.
_CONSTRAINTS = {
    "coding_constraint": "CREATE CONSTRAINT coding_constraint IF NOT EXISTS FOR (c:Coding) REQUIRE (c.code, c.label, c.system) IS UNIQUE;",
    "gene_id_constraint": "CREATE CONSTRAINT gene_id_constraint IF NOT EXISTS FOR (n:Gene) REQUIRE n.id IS UNIQUE;",
    "disease_id_constraint": "CREATE CONSTRAINT disease_id_constraint IF NOT EXISTS FOR (n:Disease) REQUIRE n.id IS UNIQUE;",
    "therapeuticprocedure_id_constraint": "CREATE CONSTRAINT therapeuticprocedure_id_constraint IF NOT EXISTS FOR (n:TherapeuticProcedure) REQUIRE n.id IS UNIQUE;",
    "variation_id_constraint": "CREATE CONSTRAINT variation_id_constraint IF NOT EXISTS FOR (n:Variation) REQUIRE n.id IS UNIQUE;",
    "categoricalvariant_id_constraint": "CREATE CONSTRAINT categoricalvariant_id_constraint IF NOT EXISTS FOR (n:CategoricalVariant) REQUIRE n.id IS UNIQUE;",
    "variantgroup_id_constraint": "CREATE CONSTRAINT variantgroup_id_constraint IF NOT EXISTS FOR (n:VariantGroup) REQUIRE n.id IS UNIQUE;",
    "location_id_constraint": "CREATE CONSTRAINT location_id_constraint IF NOT EXISTS FOR (n:Location) REQUIRE n.id IS UNIQUE;",
    "document_id_constraint": "CREATE CONSTRAINT document_id_constraint IF NOT EXISTS FOR (n:Document) REQUIRE n.id IS UNIQUE;",
    "statement_id_constraint": "CREATE CONSTRAINT statement_id_constraint IF NOT EXISTS FOR (n:Statement) REQUIRE n.id IS UNIQUE;",
    "method_id_constraint": "CREATE CONSTRAINT method_id_constraint IF NOT EXISTS FOR (n:Method) REQUIRE n.id IS UNIQUE;",
}

Expand Down
95 changes: 49 additions & 46 deletions src/metakb/load_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from neo4j import Driver, ManagedTransaction

from metakb.database import get_driver
from metakb.normalizers import VICC_NORMALIZER_DATA, ViccDiseaseNormalizerData

_logger = logging.getLogger(__name__)

Expand All @@ -16,7 +17,7 @@ def _create_parameterized_query(
) -> str:
"""Create parameterized query string for requested params if non-null in entity.

:param entity: entity to check against, eg a Variation or Study
:param entity: entity to check against, eg a Variation or Statement
:param params: Parameter names to check
:param entity_param_prefix: Prefix for parameter names in entity object
:return: Parameterized query, such as (`name:$name`)
Expand All @@ -41,18 +42,23 @@ def _add_mappings_and_exts_to_obj(obj: dict, obj_keys: list[str]) -> None:

extensions = obj.get("extensions", [])
for ext in extensions:
if ext["name"].endswith("_normalizer_data"):
obj_type = ext["name"].split("_normalizer_data")[0]
name = f"{obj_type}_normalizer_id"
obj[name] = ext["value"]["normalized_id"]
if ext["name"] == VICC_NORMALIZER_DATA:
for normalized_field in ViccDiseaseNormalizerData.model_fields:
normalized_val = ext["value"].get(normalized_field)
if normalized_val is None:
continue

name = f"normalizer_{normalized_field}"
obj[name] = normalized_val
obj_keys.append(f"{name}:${name}")
else:
name = "_".join(ext["name"].split()).lower()
val = ext["value"]
if isinstance(val, (dict | list)):
obj[name] = json.dumps(val)
else:
obj[name] = val
obj_keys.append(f"{name}:${name}")
obj_keys.append(f"{name}:${name}")


def _add_method(tx: ManagedTransaction, method: dict, ids_in_studies: set[str]) -> None:
Expand All @@ -69,11 +75,13 @@ def _add_method(tx: ManagedTransaction, method: dict, ids_in_studies: set[str])
MERGE (m:Method {id:$id, label:$label})
"""

is_reported_in = method.get("isReportedIn")
is_reported_in = method.get("reportedIn")
if is_reported_in:
# Method's documents are unique and do not currently have IDs
_add_document(tx, is_reported_in, ids_in_studies)
doc_doi = is_reported_in["doi"]
# They also only have one document
document = is_reported_in[0]
_add_document(tx, document, ids_in_studies)
doc_doi = document["doi"]
query += f"""
MERGE (d:Document {{ doi:'{doc_doi}' }})
MERGE (m) -[:IS_REPORTED_IN] -> (d)
Expand Down Expand Up @@ -253,21 +261,21 @@ def _add_variation(tx: ManagedTransaction, variation_in: dict) -> None:
tx.run(query, **v)


def _add_categorical_variation(
def _add_categorical_variant(
tx: ManagedTransaction,
categorical_variation_in: dict,
categorical_variant_in: dict,
ids_in_studies: set[str],
) -> None:
"""Add categorical variation objects to DB.
"""Add categorical variant objects to DB.

:param tx: Transaction object provided to transaction functions
:param categorical_variation_in: Categorical variation CDM object
:param categorical_variant_in: Categorical variant CDM object
:param ids_in_studies: IDs found in studies
"""
if categorical_variation_in["id"] not in ids_in_studies:
if categorical_variant_in["id"] not in ids_in_studies:
return

cv = categorical_variation_in.copy()
cv = categorical_variant_in.copy()

mp_nonnull_keys = [
_create_parameterized_query(
Expand All @@ -278,7 +286,7 @@ def _add_categorical_variation(
_add_mappings_and_exts_to_obj(cv, mp_nonnull_keys)
mp_keys = ", ".join(mp_nonnull_keys)

defining_context = cv["definingContext"]
defining_context = cv["constraints"][0]["definingContext"]
_add_variation(tx, defining_context)
dc_type = defining_context["type"]

Expand All @@ -293,9 +301,9 @@ def _add_categorical_variation(

query = f"""
{members_match}
MERGE (dc:{dc_type}:Variation {{ id: '{defining_context['id']}' }})
MERGE (dc:Variation:{dc_type} {{ id: '{defining_context['id']}' }})
MERGE (dc) -[:HAS_LOCATION] -> (loc)
MERGE (v:{cv['type']}:CategoricalVariation {{ {mp_keys} }})
MERGE (v:Variation:{cv['type']} {{ {mp_keys} }})
MERGE (v) -[:HAS_DEFINING_CONTEXT] -> (dc)
{members_relation}
"""
Expand Down Expand Up @@ -330,7 +338,7 @@ def _add_document(
document = document_in.copy()
formatted_keys = [
_create_parameterized_query(
document, ("id", "label", "title", "pmid", "url", "doi")
document, ("id", "label", "title", "pmid", "urls", "doi")
)
]

Expand Down Expand Up @@ -365,11 +373,11 @@ def _add_obj_id_to_set(obj: dict, ids_set: set[str]) -> None:
for study in studies:
for obj in [
study.get("specifiedBy"), # method
study.get("isReportedIn"),
study.get("variant"),
study.get("therapeutic"),
study.get("tumorType"),
study.get("qualifiers", {}).get("geneContext"),
study.get("reportedIn"),
study.get("subjectVariant"),
study.get("objectTherapeutic"),
study.get("conditionQualifier"),
study.get("geneContextQualifier"),
]:
if obj:
if isinstance(obj, list):
Expand All @@ -385,7 +393,7 @@ def _add_study(tx: ManagedTransaction, study_in: dict) -> None:
"""Add study node and its relationships

:param tx: Transaction object provided to transaction functions
:param study_in: Study CDM object
:param study_in: Statement CDM object
"""
study = study_in.copy()
study_type = study["type"]
Expand All @@ -396,23 +404,22 @@ def _add_study(tx: ManagedTransaction, study_in: dict) -> None:
match_line = ""
rel_line = ""

is_reported_in_docs = study.get("isReportedIn", [])
is_reported_in_docs = study.get("reportedIn", [])
for ri_doc in is_reported_in_docs:
ri_doc_id = ri_doc["id"]
name = f"doc_{ri_doc_id.split(':')[-1]}"
match_line += f"MERGE ({name} {{ id: '{ri_doc_id}'}})\n"
rel_line += f"MERGE (s) -[:IS_REPORTED_IN] -> ({name})\n"

qualifiers = study.get("qualifiers")
if qualifiers:
allele_origin = qualifiers.get("alleleOrigin")
study["alleleOrigin"] = allele_origin
match_line += "SET s.alleleOrigin=$alleleOrigin\n"
allele_origin = study.get("alleleOriginQualifier")
if allele_origin:
study["alleleOriginQualifier"] = allele_origin
match_line += "SET s.alleleOriginQualifier=$alleleOriginQualifier\n"

gene_context_id = qualifiers.get("geneContext", {}).get("id")
if gene_context_id:
match_line += f"MERGE (g:Gene {{id: '{gene_context_id}'}})\n"
rel_line += "MERGE (s) -[:HAS_GENE_CONTEXT] -> (g)\n"
gene_context_id = study.get("geneContextQualifier", {}).get("id")
if gene_context_id:
match_line += f"MERGE (g:Gene {{id: '{gene_context_id}'}})\n"
rel_line += "MERGE (s) -[:HAS_GENE_CONTEXT] -> (g)\n"

method_id = study["specifiedBy"]["id"]
match_line += f"MERGE (m {{ id: '{method_id}' }})\n"
Expand All @@ -433,24 +440,20 @@ def _add_study(tx: ManagedTransaction, study_in: dict) -> None:
match_line += f"MERGE (c:Coding {{ {coding_keys} }})\n"
rel_line += "MERGE (s) -[:HAS_STRENGTH] -> (c)\n"

variant_id = study["variant"]["id"]
if study["variant"]["type"] == "ProteinSequenceConsequence":
v_parent_type = "CategoricalVariation"
else:
v_parent_type = "Variation"
match_line += f"MERGE (v:{v_parent_type} {{ id: '{variant_id}' }})\n"
variant_id = study["subjectVariant"]["id"]
match_line += f"MERGE (v:Variation {{ id: '{variant_id}' }})\n"
rel_line += "MERGE (s) -[:HAS_VARIANT] -> (v)\n"

therapeutic_id = study["therapeutic"]["id"]
therapeutic_id = study["objectTherapeutic"]["id"]
match_line += f"MERGE (t:TherapeuticProcedure {{ id: '{therapeutic_id}' }})\n"
rel_line += "MERGE (s) -[:HAS_THERAPEUTIC] -> (t)\n"

tumor_type_id = study["tumorType"]["id"]
tumor_type_id = study["conditionQualifier"]["id"]
match_line += f"MERGE (tt:Condition {{ id: '{tumor_type_id}' }})\n"
rel_line += "MERGE (s) -[:HAS_TUMOR_TYPE] -> (tt)\n"

query = f"""
MERGE (s:{study_type}:Study {{ {study_keys} }})
MERGE (s:{study_type}:StudyStatement:Statement {{ {study_keys} }})
{match_line}
{rel_line}
"""
Expand All @@ -472,8 +475,8 @@ def add_transformed_data(driver: Driver, data: dict) -> None:
with driver.session() as session:
loaded_study_count = 0

for cv in data.get("categorical_variations", []):
session.execute_write(_add_categorical_variation, cv, ids_in_studies)
for cv in data.get("categorical_variants", []):
session.execute_write(_add_categorical_variant, cv, ids_in_studies)

for doc in data.get("documents", []):
session.execute_write(_add_document, doc, ids_in_studies)
Expand Down
24 changes: 24 additions & 0 deletions src/metakb/normalizers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import os
from collections.abc import Iterable
from enum import Enum
from typing import Literal

from botocore.exceptions import TokenRetrievalError
from disease.cli import update_db as update_disease_db
Expand All @@ -22,6 +23,7 @@
from gene.database.database import AWS_ENV_VAR_NAME as GENE_AWS_ENV_VAR_NAME
from gene.query import QueryHandler as GeneQueryHandler
from gene.schemas import NormalizeService as NormalizedGene
from pydantic import BaseModel
from therapy.cli import update_normalizer_db as update_therapy_db
from therapy.database import create_db as create_therapy_db
from therapy.database.database import AWS_ENV_VAR_NAME as THERAPY_AWS_ENV_VAR_NAME
Expand All @@ -42,6 +44,28 @@
_logger = logging.getLogger(__name__)


class ViccNormalizerData(BaseModel):
    """Base model for VICC normalizer metadata attached to harmonized concepts.

    Unknown fields are rejected (``extra="forbid"``) so payloads stay strict.
    """

    model_config = {"extra": "forbid"}

    # Normalized concept identifier and its human-readable label.
    id: str
    label: str


class ViccDiseaseNormalizerData(ViccNormalizerData):
    """VICC disease normalizer metadata, extending the base normalizer model.

    Unknown fields are rejected (``extra="forbid"``), matching the parent.
    """

    model_config = {"extra": "forbid"}

    # MONDO identifier for the normalized disease, when one was found.
    mondo_id: str | None = None


# Canonical extension name used to tag VICC normalizer payloads on objects.
VICC_NORMALIZER_DATA = "vicc_normalizer_data"


class ViccNormalizerDataExtension(Extension):
    """Define model for representing VICC normalizer data as an Extension"""

    # Pin the extension name to the canonical constant so consumers can match
    # on it.  NOTE(review): the inherited ``value`` field presumably carries a
    # ViccNormalizerData-shaped payload — confirm against callers.
    name: Literal["vicc_normalizer_data"] = VICC_NORMALIZER_DATA


class ViccNormalizers:
"""Manage VICC concept normalization services.

Expand Down
Loading