From 34fe15fe6f1e41e023a7f05d234d3b4c8903ea1a Mon Sep 17 00:00:00 2001 From: Kori Kuzma Date: Tue, 12 Nov 2024 20:26:39 -0500 Subject: [PATCH] wip: get db tests to pass --- src/metakb/database.py | 4 +- src/metakb/load_data.py | 56 +++++----- tests/conftest.py | 1 + .../therapeutic/civic_harvester.json | 6 +- tests/unit/database/test_database.py | 100 ++++++++++-------- 5 files changed, 87 insertions(+), 80 deletions(-) diff --git a/src/metakb/database.py b/src/metakb/database.py index 350ba172..92bf20c8 100644 --- a/src/metakb/database.py +++ b/src/metakb/database.py @@ -74,7 +74,7 @@ def _get_credentials( _CONSTRAINTS = { "coding_constraint": "CREATE CONSTRAINT coding_constraint IF NOT EXISTS FOR (c:Coding) REQUIRE (c.code, c.label, c.system) IS UNIQUE;", - "gene_id_constraint": "CREATE CONSTRAINT gene_id_constraint IF NOT EXISTS FOR (n:Gene}) REQUIRE n.id IS UNIQUE;", + "gene_id_constraint": "CREATE CONSTRAINT gene_id_constraint IF NOT EXISTS FOR (n:Gene) REQUIRE n.id IS UNIQUE;", "disease_id_constraint": "CREATE CONSTRAINT disease_id_constraint IF NOT EXISTS FOR (n:Disease) REQUIRE n.id IS UNIQUE;", "therapeuticprocedure_id_constraint": "CREATE CONSTRAINT therapeuticprocedure_id_constraint IF NOT EXISTS FOR (n:TherapeuticProcedure) REQUIRE n.id IS UNIQUE;", "variation_id_constraint": "CREATE CONSTRAINT variation_id_constraint IF NOT EXISTS FOR (n:Variation) REQUIRE n.id IS UNIQUE;", @@ -82,7 +82,7 @@ def _get_credentials( "variantgroup_id_constraint": "CREATE CONSTRAINT variantgroup_id_constraint IF NOT EXISTS FOR (n:VariantGroup) REQUIRE n.id IS UNIQUE;", "location_id_constraint": "CREATE CONSTRAINT location_id_constraint IF NOT EXISTS FOR (n:Location) REQUIRE n.id IS UNIQUE;", "document_id_constraint": "CREATE CONSTRAINT document_id_constraint IF NOT EXISTS FOR (n:Document) REQUIRE n.id IS UNIQUE;", - "study_id_constraint": "CREATE CONSTRAINT study_id_constraint IF NOT EXISTS FOR (n:Study) REQUIRE n.id IS UNIQUE;", + "statement_id_constraint": "CREATE 
CONSTRAINT statement_id_constraint IF NOT EXISTS FOR (n:Statement) REQUIRE n.id IS UNIQUE;", "method_id_constraint": "CREATE CONSTRAINT method_id_constraint IF NOT EXISTS FOR (n:Method) REQUIRE n.id IS UNIQUE;", } diff --git a/src/metakb/load_data.py b/src/metakb/load_data.py index 369e797f..e43be50e 100644 --- a/src/metakb/load_data.py +++ b/src/metakb/load_data.py @@ -16,7 +16,7 @@ def _create_parameterized_query( ) -> str: """Create parameterized query string for requested params if non-null in entity. - :param entity: entity to check against, eg a Variation or Study + :param entity: entity to check against, eg a Variation or Statement :param params: Parameter names to check :param entity_param_prefix: Prefix for parameter names in entity object :return: Parameterized query, such as (`name:$name`) @@ -72,8 +72,10 @@ def _add_method(tx: ManagedTransaction, method: dict, ids_in_studies: set[str]) is_reported_in = method.get("reportedIn") if is_reported_in: # Method's documents are unique and do not currently have IDs - _add_document(tx, is_reported_in, ids_in_studies) - doc_doi = is_reported_in["doi"] + # They also only have one document + document = is_reported_in[0] + _add_document(tx, document, ids_in_studies) + doc_doi = document["doi"] query += f""" MERGE (d:Document {{ doi:'{doc_doi}' }}) MERGE (m) -[:IS_REPORTED_IN] -> (d) @@ -278,7 +280,7 @@ def _add_categorical_variation( _add_mappings_and_exts_to_obj(cv, mp_nonnull_keys) mp_keys = ", ".join(mp_nonnull_keys) - defining_context = cv["definingContext"] + defining_context = cv["constraints"][0]["definingContext"] _add_variation(tx, defining_context) dc_type = defining_context["type"] @@ -293,9 +295,9 @@ def _add_categorical_variation( query = f""" {members_match} - MERGE (dc:{dc_type}:Variation {{ id: '{defining_context['id']}' }}) + MERGE (dc:Variation:{dc_type} {{ id: '{defining_context['id']}' }}) MERGE (dc) -[:HAS_LOCATION] -> (loc) - MERGE (v:{cv['type']}:CategoricalVariation {{ {mp_keys} }}) + MERGE 
(v:Variation:{cv['type']} {{ {mp_keys} }}) MERGE (v) -[:HAS_DEFINING_CONTEXT] -> (dc) {members_relation} """ @@ -330,7 +332,7 @@ def _add_document( document = document_in.copy() formatted_keys = [ _create_parameterized_query( - document, ("id", "label", "title", "pmid", "url", "doi") + document, ("id", "label", "title", "pmid", "urls", "doi") ) ] @@ -366,10 +368,10 @@ def _add_obj_id_to_set(obj: dict, ids_set: set[str]) -> None: for obj in [ study.get("specifiedBy"), # method study.get("reportedIn"), - study.get("variant"), - study.get("therapeutic"), - study.get("tumorType"), - study.get("qualifiers", {}).get("geneContext"), + study.get("subjectVariant"), + study.get("objectTherapeutic"), + study.get("conditionQualifier"), + study.get("geneContextQualifier"), ]: if obj: if isinstance(obj, list): @@ -385,7 +387,7 @@ def _add_study(tx: ManagedTransaction, study_in: dict) -> None: """Add study node and its relationships :param tx: Transaction object provided to transaction functions - :param study_in: Study CDM object + :param study_in: Statement CDM object """ study = study_in.copy() study_type = study["type"] @@ -403,16 +405,14 @@ def _add_study(tx: ManagedTransaction, study_in: dict) -> None: match_line += f"MERGE ({name} {{ id: '{ri_doc_id}'}})\n" rel_line += f"MERGE (s) -[:IS_REPORTED_IN] -> ({name})\n" - qualifiers = study.get("qualifiers") - if qualifiers: - allele_origin = qualifiers.get("alleleOrigin") - study["alleleOrigin"] = allele_origin - match_line += "SET s.alleleOrigin=$alleleOrigin\n" + allele_origin = study.get("alleleOriginQualifier") + study["alleleOriginQualifier"] = allele_origin + match_line += "SET s.alleleOriginQualifier=$alleleOriginQualifier\n" - gene_context_id = qualifiers.get("geneContext", {}).get("id") - if gene_context_id: - match_line += f"MERGE (g:Gene {{id: '{gene_context_id}'}})\n" - rel_line += "MERGE (s) -[:HAS_GENE_CONTEXT] -> (g)\n" + gene_context_id = study.get("geneContextQualifier", {}).get("id") + if gene_context_id: + 
match_line += f"MERGE (g:Gene {{id: '{gene_context_id}'}})\n" + rel_line += "MERGE (s) -[:HAS_GENE_CONTEXT] -> (g)\n" method_id = study["specifiedBy"]["id"] match_line += f"MERGE (m {{ id: '{method_id}' }})\n" @@ -433,24 +433,20 @@ def _add_study(tx: ManagedTransaction, study_in: dict) -> None: match_line += f"MERGE (c:Coding {{ {coding_keys} }})\n" rel_line += "MERGE (s) -[:HAS_STRENGTH] -> (c)\n" - variant_id = study["variant"]["id"] - if study["variant"]["type"] == "ProteinSequenceConsequence": - v_parent_type = "CategoricalVariation" - else: - v_parent_type = "Variation" - match_line += f"MERGE (v:{v_parent_type} {{ id: '{variant_id}' }})\n" + variant_id = study["subjectVariant"]["id"] + match_line += f"MERGE (v:Variation {{ id: '{variant_id}' }})\n" rel_line += "MERGE (s) -[:HAS_VARIANT] -> (v)\n" - therapeutic_id = study["therapeutic"]["id"] + therapeutic_id = study["objectTherapeutic"]["id"] match_line += f"MERGE (t:TherapeuticProcedure {{ id: '{therapeutic_id}' }})\n" rel_line += "MERGE (s) -[:HAS_THERAPEUTIC] -> (t)\n" - tumor_type_id = study["tumorType"]["id"] + tumor_type_id = study["conditionQualifier"]["id"] match_line += f"MERGE (tt:Condition {{ id: '{tumor_type_id}' }})\n" rel_line += "MERGE (s) -[:HAS_TUMOR_TYPE] -> (tt)\n" query = f""" - MERGE (s:{study_type}:Study {{ {study_keys} }}) + MERGE (s:{study_type}:Statement {{ {study_keys} }}) {match_line} {rel_line} """ diff --git a/tests/conftest.py b/tests/conftest.py index 690193e7..bdbb9aa9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -602,6 +602,7 @@ def civic_tid146(): "alternativeLabels": [ "BIBW2992", "BIBW 2992", + "BIBW-2992", "(2e)-N-(4-(3-Chloro-4-Fluoroanilino)-7-(((3s)-Oxolan-3-yl)Oxy)Quinoxazolin-6-yl)-4-(Dimethylamino)But-2-Enamide", ], "extensions": [ diff --git a/tests/data/transformers/therapeutic/civic_harvester.json b/tests/data/transformers/therapeutic/civic_harvester.json index a5a694cc..1f7d6ecc 100644 --- a/tests/data/transformers/therapeutic/civic_harvester.json 
+++ b/tests/data/transformers/therapeutic/civic_harvester.json @@ -26,7 +26,8 @@ "aliases": [ "(2e)-N-(4-(3-Chloro-4-Fluoroanilino)-7-(((3s)-Oxolan-3-yl)Oxy)Quinoxazolin-6-yl)-4-(Dimethylamino)But-2-Enamide", "BIBW 2992", - "BIBW2992" + "BIBW2992", + "BIBW-2992" ], "type": "therapie" } @@ -270,7 +271,8 @@ "aliases": [ "(2e)-N-(4-(3-Chloro-4-Fluoroanilino)-7-(((3s)-Oxolan-3-yl)Oxy)Quinoxazolin-6-yl)-4-(Dimethylamino)But-2-Enamide", "BIBW 2992", - "BIBW2992" + "BIBW2992", + "BIBW-2992" ], "type": "therapie" } diff --git a/tests/unit/database/test_database.py b/tests/unit/database/test_database.py index dc1206c9..a614bdcc 100644 --- a/tests/unit/database/test_database.py +++ b/tests/unit/database/test_database.py @@ -94,7 +94,7 @@ def check_study_relation(driver: Driver): def _check_function(value_label: str): query = f""" MATCH (d:{value_label}) - OPTIONAL MATCH (d)<-[:HAS_{value_label.upper()}]-(s:Study) + OPTIONAL MATCH (d)<-[:HAS_{value_label.upper()}]-(s:Statement) WITH d, COUNT(s) as s_count WHERE s_count < 1 RETURN COUNT(s_count) @@ -207,7 +207,12 @@ def test_gene_rules( """Verify property and relationship rules for Gene nodes.""" check_unique_property("Gene", "id") check_relation_count( - "Gene", "Study", "HAS_GENE_CONTEXT", direction="in", min_rels=1, max_rels=None + "Gene", + "Statement", + "HAS_GENE_CONTEXT", + direction="in", + min_rels=1, + max_rels=None, ) expected_labels = [{"Gene"}] @@ -241,7 +246,7 @@ def test_variation_rules( # members dont have defining context check_relation_count( "Variation", - "CategoricalVariation", + "CategoricalVariant", "HAS_DEFINING_CONTEXT", direction="in", min_rels=0, @@ -249,25 +254,23 @@ def test_variation_rules( ) check_relation_count( "Variation", - "CategoricalVariation", + "CategoricalVariant", "HAS_MEMBERS", min_rels=0, max_rels=None, direction="in", ) - expected_labels = [{"Variation", "Allele"}] - check_node_labels("Variation", expected_labels, 1) + expected_labels = [{"Variation", "Allele"}, {"Variation", 
"CategoricalVariant"}] + check_node_labels("Variation", expected_labels, 2) - # all Alleles are Variations and all Variations are Alleles + # all Variations are either Alleles or CategoricalVariants, and all Alleles and CategoricalVariants are Variation label_query = """ - MATCH (v:Variation) - WHERE NOT (v:Allele) - RETURN COUNT(v) - UNION - MATCH (v:Allele) - WHERE NOT (v:Variation) - RETURN COUNT(v) + MATCH (v) + RETURN + SUM(CASE WHEN (v:Variation AND NOT (v:Allele OR v:CategoricalVariant)) THEN 1 ELSE 0 END) + + SUM(CASE WHEN (v:Allele AND NOT v:Variation) THEN 1 ELSE 0 END) + + SUM(CASE WHEN (v:CategoricalVariant AND NOT v:Variation) THEN 1 ELSE 0 END) """ with driver.session() as s: record = s.run(label_query).single() @@ -280,8 +283,6 @@ def test_variation_rules( "digest", "state", "expression_hgvs_p", - "expression_hgvs_c", - "expression_hgvs_g", "type", } @@ -301,8 +302,6 @@ def test_variation_rules( expected_g.append(val) assert v["expression_hgvs_p"] == expected_p - assert set(v["expression_hgvs_c"]) == set(expected_c) - assert v["expression_hgvs_g"] == expected_g def test_categorical_variation_rules( @@ -313,16 +312,16 @@ def test_categorical_variation_rules( civic_mpid12, ): """Verify property and relationship rules for Categorical Variation nodes.""" - check_unique_property("CategoricalVariation", "id") + check_unique_property("CategoricalVariant", "id") check_relation_count( - "CategoricalVariation", "Variation", "HAS_DEFINING_CONTEXT", max_rels=1 + "CategoricalVariant", "Variation", "HAS_DEFINING_CONTEXT", max_rels=1 ) check_relation_count( - "CategoricalVariation", "Variation", "HAS_MEMBERS", min_rels=0, max_rels=None + "CategoricalVariant", "Variation", "HAS_MEMBERS", min_rels=0, max_rels=None ) - expected_node_labels = [{"CategoricalVariation", "ProteinSequenceConsequence"}] - check_node_labels("CategoricalVariation", expected_node_labels, 1) + expected_node_labels = [{"CategoricalVariant", "Variation"}] + check_node_labels("CategoricalVariant", 
expected_node_labels, 1) cv = get_node_by_id(civic_mpid12["id"]) assert set(cv.keys()) == { @@ -414,7 +413,7 @@ def test_therapeutic_procedure_rules( # through CombinationTherapy and TherapeuticSubstituteGroup check_relation_count( "TherapeuticProcedure", - "Study", + "Statement", "HAS_THERAPEUTIC", min_rels=0, max_rels=None, @@ -424,7 +423,11 @@ def test_therapeutic_procedure_rules( "CombinationTherapy", "TherapeuticAgent", "HAS_COMPONENTS", max_rels=None ) check_relation_count( - "CombinationTherapy", "Study", "HAS_THERAPEUTIC", max_rels=None, direction="in" + "CombinationTherapy", + "Statement", + "HAS_THERAPEUTIC", + max_rels=None, + direction="in", ) check_relation_count( "TherapeuticSubstituteGroup", @@ -434,7 +437,7 @@ def test_therapeutic_procedure_rules( ) check_relation_count( "TherapeuticSubstituteGroup", - "Study", + "Statement", "HAS_THERAPEUTIC", max_rels=None, direction="in", @@ -489,7 +492,7 @@ def test_condition_rules( """Verify property and relationship rules for condition nodes.""" check_unique_property("Condition", "id") check_relation_count( - "Condition", "Study", "HAS_TUMOR_TYPE", max_rels=None, direction="in" + "Condition", "Statement", "HAS_TUMOR_TYPE", max_rels=None, direction="in" ) expected_node_labels = [{"Disease", "Condition"}] @@ -511,21 +514,21 @@ def test_study_rules( civic_eid2997_study, check_node_props, ): - """Verify property and relationship rules for Study nodes.""" - check_unique_property("Study", "id") + """Verify property and relationship rules for Statement nodes.""" + check_unique_property("Statement", "id") - check_relation_count("Study", "CategoricalVariation", "HAS_VARIANT") - check_relation_count("Study", "Condition", "HAS_TUMOR_TYPE") - check_relation_count("Study", "TherapeuticProcedure", "HAS_THERAPEUTIC") - check_relation_count("Study", "Coding", "HAS_STRENGTH") - check_relation_count("Study", "Method", "IS_SPECIFIED_BY", max_rels=None) - check_relation_count("Study", "Gene", "HAS_GENE_CONTEXT", max_rels=None) + 
check_relation_count("Statement", "CategoricalVariant", "HAS_VARIANT") + check_relation_count("Statement", "Condition", "HAS_TUMOR_TYPE") + check_relation_count("Statement", "TherapeuticProcedure", "HAS_THERAPEUTIC") + check_relation_count("Statement", "Coding", "HAS_STRENGTH") + check_relation_count("Statement", "Method", "IS_SPECIFIED_BY", max_rels=None) + check_relation_count("Statement", "Gene", "HAS_GENE_CONTEXT", max_rels=None) - expected_node_labels = [{"Study", "VariantTherapeuticResponseStudy"}] - check_node_labels("Study", expected_node_labels, 1) + expected_node_labels = [{"Statement", "VariantTherapeuticResponseStudyStatement"}] + check_node_labels("Statement", expected_node_labels, 1) cite_query = """ - MATCH (s:Study) + MATCH (s:Statement) OPTIONAL MATCH (s)-[:IS_REPORTED_IN]->(d:Document) WITH s, COUNT(d) as d_count WHERE d_count < 1 @@ -541,12 +544,12 @@ def test_study_rules( "description", "direction", "predicate", - "alleleOrigin", + "alleleOriginQualifier", "type", } civic_eid2997_study_cp = civic_eid2997_study.copy() - civic_eid2997_study_cp["alleleOrigin"] = civic_eid2997_study_cp["qualifiers"][ - "alleleOrigin" + civic_eid2997_study_cp["alleleOriginQualifier"] = civic_eid2997_study_cp[ + "alleleOriginQualifier" ] check_node_props(study, civic_eid2997_study_cp, expected_keys) @@ -564,7 +567,12 @@ def test_document_rules( """Verify property and relationship rules for Document nodes.""" check_unique_property("Document", "id") check_relation_count( - "Document", "Study", "IS_REPORTED_IN", min_rels=0, max_rels=None, direction="in" + "Document", + "Statement", + "IS_REPORTED_IN", + min_rels=0, + max_rels=None, + direction="in", ) expected_labels = [{"Document"}] @@ -573,7 +581,7 @@ def test_document_rules( # PMIDs: 31779674 and 35121878 do not have this relationship is_reported_in_query = """ MATCH (s:Document) - OPTIONAL MATCH (s)<-[:IS_REPORTED_IN]-(d:Study) + OPTIONAL MATCH (s)<-[:IS_REPORTED_IN]-(d:Statement) WITH s, COUNT(d) as d_count WHERE 
(d_count < 1) AND (s.pmid <> 31779674) AND (s.pmid <> 35121878) RETURN COUNT(s) @@ -594,7 +602,7 @@ def test_document_rules( doc = get_node_by_id(moa_source45["id"]) extension_names = {"source_type"} check_extension_props(doc, moa_source45["extensions"], extension_names) - expected_keys = {"id", "title", "doi", "source_type", "url", "pmid"} + expected_keys = {"id", "title", "doi", "source_type", "urls", "pmid"} check_node_props(doc, moa_source45, expected_keys, extension_names) @@ -609,7 +617,7 @@ def test_method_rules( """Verify property and relationship rules for Method nodes.""" check_unique_property("Method", "id") check_relation_count( - "Method", "Study", "IS_SPECIFIED_BY", max_rels=None, direction="in" + "Method", "Statement", "IS_SPECIFIED_BY", max_rels=None, direction="in" ) expected_node_labels = [{"Method"}] @@ -626,7 +634,7 @@ def test_no_lost_nodes(driver: Driver): labels_query = """ MATCH (n) WHERE size(labels(n)) = 0 - AND NOT (n)<-[:IS_REPORTED_IN]-(:Study) + AND NOT (n)<-[:IS_REPORTED_IN]-(:Statement) RETURN COUNT(n) """ with driver.session() as s: