From ea42de72ac742f228eb6befac722920e0ba4b725 Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Fri, 19 Jan 2024 16:25:05 -0500 Subject: [PATCH] add changes to files --- .gitignore | 3 +- .../harvester/civic_harvester_example.py | 59 +- .../transform/civic_transform_example.py | 111 ++- analysis/graph/db_helper.py | 10 +- analysis/graph/missing_diseases_counts.txt | 2 + analysis/graph/missing_therapies_counts.txt | 130 +++ analysis/graph/missing_variants_counts.txt | 130 +++ analysis/graph/no_ncit_disease_counts.txt | 9 + .../unsupported_variants_civic_counts.txt | 939 ++++++++++++++++++ .../unsupported_variants_hgvs_counts.txt | 8 + .../harvester/moa_harvester_example.py | 52 +- .../transform/moa_transform_example.py | 79 +- .../examples/harvester/pharmgkb_harvester.py | 7 + .../initial exploratory analysis.ipynb | 151 +++ analysis/pharmgkb/notes.md | 41 + metakb/__init__.py | 16 +- metakb/cli.py | 274 ++--- metakb/database.py | 340 ++++--- metakb/delta.py | 69 +- metakb/exceptions.py | 2 +- metakb/harvesters/__init__.py | 4 - metakb/harvesters/base.py | 10 +- metakb/harvesters/civic.py | 16 +- metakb/harvesters/moa.py | 101 +- metakb/harvesters/oncokb.py | 52 +- metakb/main.py | 110 +- metakb/normalizers.py | 103 +- metakb/query.py | 553 ++++++----- metakb/schemas.py | 584 +++++------ metakb/transform/__init__.py | 10 +- metakb/transform/base.py | 80 +- metakb/transform/civic.py | 539 +++++----- metakb/transform/moa.py | 314 +++--- metakb/transform/oncokb.py | 203 ++-- pyproject.toml | 3 + setup.py | 2 +- tests/conftest.py | 898 +++++++---------- tests/unit/database/test_database.py | 135 ++- tests/unit/deltas/test_civic_deltas.py | 188 ++-- tests/unit/deltas/test_moa_deltas.py | 185 ++-- .../harvesters/moa/test_moa_assertions.py | 27 +- tests/unit/harvesters/moa/test_moa_harvest.py | 9 +- tests/unit/harvesters/moa/test_moa_source.py | 14 +- .../harvesters/oncokb/test_oncokb_harvest.py | 27 +- tests/unit/harvesters/test_base_class.py | 6 +- tests/unit/harvesters/test_civic_harvester.py | 12 +- tests/unit/setup/test_minimal_setup.py | 2 +- tests/unit/test_query.py | 812 +++++++++------ tests/unit/test_search_statements.py | 214 ++-- .../test_civic_transform_diagnostic.py | 66 +- .../test_civic_transform_prognostic.py | 54 +- .../test_civic_transform_therapeutic.py | 56 +- tests/unit/transform/test_moa_transform.py | 59 +- tests/unit/transform/test_oncokb_transform.py | 85 +- 54 files changed, 4937 insertions(+), 3028 deletions(-) create mode 100644 analysis/graph/missing_diseases_counts.txt create mode 100644 analysis/graph/missing_therapies_counts.txt create mode 100644 analysis/graph/missing_variants_counts.txt create mode 100644 analysis/graph/no_ncit_disease_counts.txt create mode 100644 analysis/graph/unsupported_variants_civic_counts.txt create mode 100644 analysis/graph/unsupported_variants_hgvs_counts.txt create mode 100644 analysis/pharmgkb/examples/harvester/pharmgkb_harvester.py create mode 100644 analysis/pharmgkb/initial exploratory analysis.ipynb create mode 100644 analysis/pharmgkb/notes.md diff --git a/.gitignore b/.gitignore index c74f2dbe..4abf42a6 100644 --- a/.gitignore +++ b/.gitignore @@ -130,7 +130,6 @@ analysis/graph/*.ipynb # Build files Pipfile.lock -pyproject.toml # DynamoDB dynamodb_local_latest/ @@ -138,4 +137,4 @@ dynamodb_local_latest/ # Zip *.zip -notebooks \ No newline at end of file +notebooks diff --git a/analysis/civic/examples/harvester/civic_harvester_example.py b/analysis/civic/examples/harvester/civic_harvester_example.py index 1ed15c18..345cc119 100644 --- a/analysis/civic/examples/harvester/civic_harvester_example.py +++ b/analysis/civic/examples/harvester/civic_harvester_example.py @@ -8,32 +8,35 @@ def create_evidence_examples(data): """Create five CIViC evidence examples.""" evidence_items = list() - for i in range(len(data['evidence'])): - if data['evidence'][i]['assertions']: - evidence_items.append(data['evidence'][i]) + for i in range(len(data["evidence"])): + if data["evidence"][i]["assertions"]: + evidence_items.append(data["evidence"][i]) if len(evidence_items) == 6: break for evidence_item in evidence_items: - variant_id = evidence_item['variant_id'] - gene_id = evidence_item['gene_id'] - assertions = evidence_item['assertions'] + variant_id = evidence_item["variant_id"] + gene_id = evidence_item["gene_id"] + assertions = evidence_item["assertions"] - for v in data['variants']: - if v['id'] == variant_id: + for v in data["variants"]: + if v["id"] == variant_id: variant = v - for g in data['genes']: - if g['id'] == gene_id: + for g in data["genes"]: + if g["id"] == gene_id: gene = g - with open(f"{PROJECT_ROOT}/analysis/civic/examples/harvester/" - f"{evidence_item['name']}.json", 'w+') as f: + with open( + f"{PROJECT_ROOT}/analysis/civic/examples/harvester/" + f"{evidence_item['name']}.json", + "w+", + ) as f: example = { - 'EVIDENCE': evidence_item, - 'GENE': gene, - 'VARIANT': variant, - 'ASSERTIONS': assertions + "EVIDENCE": evidence_item, + "GENE": gene, + "VARIANT": variant, + "ASSERTIONS": assertions, } json.dump(example, f, indent=4) @@ -45,26 +48,30 @@ def create_variant_examples(data): """ variants_ids = [12, 1, 221, 190] variants = list() - for i in range(len(data['variants'])): - if data['variants'][i]['id'] in variants_ids: - variants.append(data['variants'][i]) + for i in range(len(data["variants"])): + if data["variants"][i]["id"] in variants_ids: + variants.append(data["variants"][i]) for variant in variants: - with open(f"{PROJECT_ROOT}/analysis/civic/examples/harvester/" - f"{variant['name'].lower()}.json", 'w+') as f: - variant['evidence_items'] = variant['evidence_items'][0] + with open( + f"{PROJECT_ROOT}/analysis/civic/examples/harvester/" + f"{variant['name'].lower()}.json", + "w+", + ) as f: + variant["evidence_items"] = variant["evidence_items"][0] f.write(json.dumps(variant, indent=4)) -if __name__ == '__main__': +if __name__ == "__main__": c = CIViCHarvester() c.harvest() - latest = sorted((APP_ROOT / "data" / "civic" / "harvester").glob("civic_harvester_*.json"))[-1] # noqa: E501 + latest = sorted( + (APP_ROOT / "data" / "civic" / "harvester").glob("civic_harvester_*.json") + )[-1] with open(latest, "r") as f: civic_data = json.load(f) - civic_ex_dir =\ - PROJECT_ROOT / 'analysis' / 'civic' / 'examples' / 'harvester' + civic_ex_dir = PROJECT_ROOT / "analysis" / "civic" / "examples" / "harvester" civic_ex_dir.mkdir(exist_ok=True, parents=True) create_evidence_examples(civic_data) diff --git a/analysis/civic/examples/transform/civic_transform_example.py b/analysis/civic/examples/transform/civic_transform_example.py index 5a9aadab..4fabcd1a 100644 --- a/analysis/civic/examples/transform/civic_transform_example.py +++ b/analysis/civic/examples/transform/civic_transform_example.py @@ -1,28 +1,29 @@ """Create an example json file for CIViC Transform.""" import json -from metakb import PROJECT_ROOT, APP_ROOT +from metakb import APP_ROOT, PROJECT_ROOT from metakb.transform import CIViCTransform def create_civic_example(civic_data): """Create CIViC transform examples from list of evidence items.""" ex = { - 'statements': [], - 'propositions': [], - 'variation_descriptors': [], - 'gene_descriptors': [], - 'therapy_descriptors': [], - 'disease_descriptors': [], - 'methods': [], - 'documents': [] + "statements": [], + "propositions": [], + "variation_descriptors": [], + "gene_descriptors": [], + "therapy_descriptors": [], + "disease_descriptors": [], + "methods": [], + "documents": [], } supported_by_statement_ids = set() - for s in civic_data['statements']: - if s['id'] == 'civic.aid:6': - supported_by_statement_ids = \ - {s for s in s['supported_by'] if s.startswith('civic.eid')} - supported_by_statement_ids.add(s['id']) + for s in civic_data["statements"]: + if s["id"] == "civic.aid:6": + supported_by_statement_ids = { + s for s in s["supported_by"] if s.startswith("civic.eid") + } + supported_by_statement_ids.add(s["id"]) break proposition_ids = set() @@ -32,56 +33,66 @@ def create_civic_example(civic_data): gids = set() methods = set() documents = set() - for s in civic_data['statements']: - if s['id'] in supported_by_statement_ids: - ex['statements'].append(s) - proposition_ids.add(s['proposition']) - vids.add(s['variation_descriptor']) - tids.add(s['therapy_descriptor']) - dids.add(s['disease_descriptor']) - methods.add(s['method']) - documents.update({d for d in s['supported_by'] if - not d.startswith('civic.eid')}) + for s in civic_data["statements"]: + if s["id"] in supported_by_statement_ids: + ex["statements"].append(s) + proposition_ids.add(s["proposition"]) + vids.add(s["variation_descriptor"]) + tids.add(s["therapy_descriptor"]) + dids.add(s["disease_descriptor"]) + methods.add(s["method"]) + documents.update( + {d for d in s["supported_by"] if not d.startswith("civic.eid")} + ) - for p in civic_data['propositions']: - if p['id'] in proposition_ids: - ex['propositions'].append(p) + for p in civic_data["propositions"]: + if p["id"] in proposition_ids: + ex["propositions"].append(p) - for v in civic_data['variation_descriptors']: - if v['id'] in vids: - ex['variation_descriptors'].append(v) - gids.add(v['gene_context']) + for v in civic_data["variation_descriptors"]: + if v["id"] in vids: + ex["variation_descriptors"].append(v) + gids.add(v["gene_context"]) - for t in civic_data['therapy_descriptors']: - if t['id'] in tids: - ex['therapy_descriptors'].append(t) + for t in civic_data["therapy_descriptors"]: + if t["id"] in tids: + ex["therapy_descriptors"].append(t) - for d in civic_data['disease_descriptors']: - if d['id'] in dids: - ex['disease_descriptors'].append(d) + for d in civic_data["disease_descriptors"]: + if d["id"] in dids: + ex["disease_descriptors"].append(d) - for g in civic_data['gene_descriptors']: - if g['id'] in gids: - ex['gene_descriptors'].append(g) + for g in civic_data["gene_descriptors"]: + if g["id"] in gids: + ex["gene_descriptors"].append(g) - for m in civic_data['methods']: - if m['id'] in methods: - ex['methods'].append(m) + for m in civic_data["methods"]: + if m["id"] in methods: + ex["methods"].append(m) - for d in civic_data['documents']: - if d['id'] in documents: - ex['documents'].append(d) + for d in civic_data["documents"]: + if d["id"] in documents: + ex["documents"].append(d) - with open(PROJECT_ROOT / "analysis" / "civic" / "examples" / # noqa: W504 - "transform" / "civic_cdm_example.json", 'w+') as f2: + with open( + PROJECT_ROOT + / "analysis" + / "civic" + / "examples" + / "transform" + / "civic_cdm_example.json", + "w+", + ) as f2: json.dump(ex, f2, indent=4) -if __name__ == '__main__': +if __name__ == "__main__": civic = CIViCTransform() civic.transform() civic.create_json() - latest = sorted((APP_ROOT / "data" / "civic" / "transform").glob("civic_cdm_*.json"))[-1] # noqa: E501 + latest = sorted( + (APP_ROOT / "data" / "civic" / "transform").glob("civic_cdm_*.json") + )[-1] with open(latest, "r") as f: civic_data = json.load(f) create_civic_example(civic_data) diff --git a/analysis/graph/db_helper.py b/analysis/graph/db_helper.py index 3985457b..adb8622b 100644 --- a/analysis/graph/db_helper.py +++ b/analysis/graph/db_helper.py @@ -1,19 +1,19 @@ """Utility function to load/reload graph for development.""" -from metakb.database import Graph -from metakb import APP_ROOT import json +from metakb import APP_ROOT +from metakb.database import Graph g = Graph(uri="bolt://localhost:7687", credentials=("neo4j", "admin")) g.clear() -fpath = APP_ROOT / 'data' / 'civic' / 'transform' / 'civic_cdm.json' -with open(fpath, 'r') as f: +fpath = APP_ROOT / "data" / "civic" / "transform" / "civic_cdm.json" +with open(fpath, "r") as f: items = json.load(f) count = 0 for item in items: - if 'assertion' in item.keys(): + if "assertion" in item.keys(): continue else: g.add_transformed_data(item) diff --git a/analysis/graph/missing_diseases_counts.txt b/analysis/graph/missing_diseases_counts.txt new file mode 100644 index 00000000..856828cd --- /dev/null +++ b/analysis/graph/missing_diseases_counts.txt @@ -0,0 +1,2 @@ +TALL and T-Cell Acute Lymphoid Leukemia, 6 +T-Cell Acute Lymphoid Leukemia, 6 diff --git a/analysis/graph/missing_therapies_counts.txt b/analysis/graph/missing_therapies_counts.txt new file mode 100644 index 00000000..6dfd105e --- /dev/null +++ b/analysis/graph/missing_therapies_counts.txt @@ -0,0 +1,130 @@ +None, 184 +Platinum, 28 +Radiation therapy, 22 +Selumetinib, 21 +Dabrafenib + Trametinib, 18 +Bevacizumab + Olaparib, 18 +ncit:None and PI3Ka/Di, 9 +Durvalumab, 8 +Entrectinib, 7 +Erdafitinib, 6 +Proton-based SBRT, 6 +ncit:None and PI-103, 6 +Tazemetostat, 5 +ncit:C15632 and Chemotherapy, 5 +ncit:None and JQ1, 5 +ncit:None and SU11274, 5 +Gamma knife, 4 +U0126, 4 +MK-2206, 4 +Ipilimumab + Vemurafenib, 4 +Capmatinib, 4 +Alkylating chemotherapy, 4 +Neoadjuvant chemoradiation, 4 +nutlin-3, 4 +Talazoparib, 4 +Lapatinib + Trastuzumab, 4 +Neoadjuvant chemotherapy + surgery, 4 +ncit:None and PD173074, 4 +Larotrectinib, 3 +Selpercatinib, 3 +Pralsetinib, 3 +ncit:None and RDEA 119, 3 +ncit:None and EAP Protocol, 3 +JQ1, 2 +Pemigatinib, 2 +Azacitidine + Panobinostat, 2 +Encorafenib, 2 +Cetuximab + Encorafenib, 2 +Cobimetinib + Vemurafenib, 2 +Selumetinib + Vemurafenib, 2 +Mito-CP, 2 +Trametinib + Vemurafenib, 2 +Interferon-alpha + Ixazomib, 2 +GANT61, 2 +GANT61 + Obatoclax, 2 +Neratinib + Vemurafenib, 2 +Cetuximab + Vemurafenib, 2 +Nivolumab + Pembrolizumab, 2 +Durvalumab + Osimertinib, 2 +Durvalumab + Gefitinib, 2 +Ado-Trastuzumab Emtansine, 2 +Letrozole + Tamoxifen, 2 +Palbociclib + Trametinib, 2 +Buparlisib + Trametinib, 2 +FGFR1 inhibitor + Trametinib, 2 +FGFR1 inhibitor + Trametinib , 2 +SBRT, 2 +SU11274, 2 +EXEL-8232, 2 +Chemotherapy + Pembrolizumab, 2 +EPZ015666, 2 +Pertuzumab + Trastuzumab, 2 +Chemotherapy + Trastuzumab, 2 +Capecitabine + Trastuzumab + Tucatinib, 2 +Chemotherapy + Hyaluronidase-zzxf + Pertuzumab + Trastuzumab, 2 +Docetaxel + Hyaluronidase-zzxf + Pertuzumab + Trastuzumab, 2 +Margetuximab-cmkb + Chemotherapy, 2 +Carbogen and nicotinamide + radiotherapy, 2 +Radical radiotherapy, 2 +Ipilimumab + Nivolumab, 2 +ncit:None and SU5614, 2 +ncit:None and AGI-5198, 2 +ncit:C15360 and Adjuvant Chemotherapy, 2 +ncit:None and Spliceostatin A, 2 +ncit:None and GSK126, 2 +ncit:None and PI103, 2 +ncit:None and BPTES, 2 +ncit:None and JQ-1, 2 +ncit:None and U0126, 2 +ncit:C154898 and Inhibitor, 2 +ncit:None and R3Mab, 2 +ncit:None and FGF/VEGF Receptor Tyrosine Kinase Inhibitor, PD173074, 2 +Lorlatinib, 1 +Omacetaxine, 1 +Dacomitinib, 1 +Ivosidenib, 1 +Alpelisib, 1 +Atezolizumab, 1 +ncit:None and Anti-CD33, 1 +ncit:None and NSC348884, 1 +ncit:C158876 and Induction Therapy, 1 +ncit:None and JW55, 1 +ncit:None and G007-LK, 1 +ncit:C11197 and FOLFOX Regimen, 1 +ncit:None and GSK321, 1 +ncit:None and SB202190, 1 +ncit:None and AKTi-1/2, 1 +ncit:None and WZ4002, 1 +ncit:None and JSI-124, 1 +ncit:None and NOTCH1 Antibody (PF-06293622), 1 +ncit:None and RG7356, 1 +ncit:None and GNE-617, 1 +ncit:None and ACLY SiRNA, 1 +ncit:C1490 and Taxane Compound, 1 +ncit:None and IGF1R Monoclonal Antibody, 1 +ncit:None and OICR-9429, 1 +ncit:None and AZD5438, 1 +ncit:None and WHI-P154, 1 +ncit:None and 2,4-pyrimidinediamine, 1 +ncit:None and AZD3463, 1 +ncit:None and A66, 1 +ncit:None and Tgx 221, 1 +ncit:None and JQ1 Compound, 1 +ncit:None and GPI-15427, 1 +ncit:None and C646, 1 +ncit:None and UO126, 1 +ncit:None and UNC1062, 1 +ncit:C15313 and Radiation Therapy, 1 +ncit:C2462 and Radioactive Iodine, 1 +ncit:None and NVP-AEW541, 1 +ncit:None and ARS-1620, 1 +ncit:None and ALK2 Inhibitor LDN-193189, 1 +ncit:None and JQEZ5, 1 +ncit:None and CHZ868, 1 +ncit:None and JAK Inhibitor I, 1 +ncit:None and DS-6501b, 1 +ncit:None and MAP855, 1 +ncit:None and EPZ004777, 1 +ncit:None and CHMFL-KIT-031, 1 +ncit:None and EPZ011989, 1 diff --git a/analysis/graph/missing_variants_counts.txt b/analysis/graph/missing_variants_counts.txt new file mode 100644 index 00000000..9a7550b7 --- /dev/null +++ b/analysis/graph/missing_variants_counts.txt @@ -0,0 +1,130 @@ +BRCA1 (Pathogenic), 16 +BRCA2, 14 +KRAS, 11 +BRCA2 (Pathogenic), 11 +BCR--ABL1 Fusion, 10 +KIT, 10 +BRCA1, 9 +EGFR, 9 +PTEN Deletion, 7 +ERBB2 Amplification, 5 +PTEN (Nonsense), 4 +CDK4 Amplification, 4 +FGFR1 Amplification, 4 +FGFR2 Amplification, 4 +MET Amplification, 4 +MSI-High, 4 +ALK Fusion, 3 +ALK Translocation, 3 +TMPRSS2--ERG Fusion, 3 +BRAF, 3 +BRCA1 (Oncogenic Mutations), 3 +BRCA2 (Oncogenic Mutations), 3 +FLCN (Nonsense), 3 +FLCN (Frameshift), 3 +KIT (Oncogenic Mutations), 3 +PTEN (Frameshift), 3 +PTEN (Splice Site), 3 +AR Amplification, 3 +ALK, 2 +PDGFRA, 2 +PDGFRB Translocation, 2 +RET, 2 +EGFR (Nonsense), 2 +ERCC2 (Missense), 2 +MLH3, 2 +MSH2, 2 +MSH6, 2 +NRAS, 2 +POLD1, 2 +POLE, 2 +TSC1 (Nonsense), 2 +TSC1 (Frameshift), 2 +TSC2 (Nonsense), 2 +TSC2 (Frameshift), 2 +COSMIC Signature 10, 2 +COSMIC Signature 3, 2 +COSMIC Signature 4, 2 +High, 2 +High (>= 100.0 mutations), 2 +RAD17 (shRNA), 2 +CCND1 Translocation, 1 +CCND3 Translocation, 1 +COL1A1--PDGFB Fusion, 1 +EML4--ALK Fusion, 1 +ESRP1--RAF1 Fusion, 1 +FGFR2--TACC3 Fusion, 1 +FGFR3--NSD2 Fusion, 1 +FIP1L1--PDGFRA Fusion, 1 +BCR--PDGFRA Fusion, 1 +PDGFRB, 1 +RET Fusion, 1 +ROS1 Fusion, 1 +SLC45A3--BRAF Fusion, 1 +ABL1 (Missense), 1 +ABL1, 1 +ATM (Frameshift), 1 +ATM (Splice Site), 1 +ATM (Nonsense), 1 +ATM, 1 +BARD1, 1 +BCR, 1 +BRIP1, 1 +CDK12, 1 +CHEK1, 1 +CHEK2, 1 +EGFR (Insertion), 1 +EGFR (Deletion), 1 +ERBB2 (Missense), 1 +ERBB3 (Missense), 1 +FANCL, 1 +FBXW7 (Missense), 1 +MET (Splice Site), 1 +MET (Deletion), 1 +MET (Nonsense), 1 +MTOR, 1 +MYH, 1 +PALB2, 1 +PBRM1 (Nonsense), 1 +PBRM1 (Frameshift), 1 +PIK3CA, 1 +PIK3CB, 1 +RAD51B, 1 +RAD51C, 1 +RAD51D, 1 +RAD54L, 1 +SMARCA4 (Nonsense), 1 +SMARCA4 (Frameshift), 1 +SMARCA4 (Splice Site), 1 +TET2, 1 +TP53 (Nonsense), 1 +TP53 (Frameshift), 1 +TP53 (Splice Site), 1 +TSC1 (Oncogenic Mutations), 1 +TSC1, 1 +TSC2, 1 +TSC2 (Oncogenic Mutations), 1 +ATM (Pathogenic), 1 +AURKA Amplification, 1 +AURKB Amplification, 1 +BRAF Amplification, 1 +BRCA2 Deletion, 1 +CCND1 Amplification, 1 +CCNE1 Amplification, 1 +CD274 Amplification, 1 +CDKN2A Deletion, 1 +CDKN2C Deletion, 1 +CRKL Amplification, 1 +EGFR Amplification, 1 +FBXW7 Deletion, 1 +KIT Amplification, 1 +PAK1 Amplification, 1 +PBRM1 Deletion, 1 +PIK3CA Amplification, 1 +COSMIC Signature 5, 1 +High (>= 178.0 mutations), 1 +High (>= 10.0 mutations/Mb), 1 +ATM (shRNA), 1 +CDK12 (shRNA), 1 +CDK12 (siRNA), 1 +RAD50 (shRNA), 1 diff --git a/analysis/graph/no_ncit_disease_counts.txt b/analysis/graph/no_ncit_disease_counts.txt new file mode 100644 index 00000000..c74698e9 --- /dev/null +++ b/analysis/graph/no_ncit_disease_counts.txt @@ -0,0 +1,9 @@ +doid:0060108, Brain Glioma,13 +doid:5603, T-cell Acute Lymphoblastic Leukemia,11 +doid:4450, Renal Cell Carcinoma,1 +doid:0060474, Chuvash Polycythemia,1 +doid:0060075, Estrogen-receptor Positive Breast Cancer,1 +doid:0080797, Nasal Type Extranodal NK/T-cell Lymphoma,1 +doid:0111278, Histiocytosis-Lymphadenopathy Plus Syndrome,1 +doid:0080808, Mammary Analogue Secretory Carcinoma,1 +doid:0080684, Diffuse Midline Glioma, H3 K27M-mutant,1 diff --git a/analysis/graph/unsupported_variants_civic_counts.txt b/analysis/graph/unsupported_variants_civic_counts.txt new file mode 100644 index 00000000..f50d053a --- /dev/null +++ b/analysis/graph/unsupported_variants_civic_counts.txt @@ -0,0 +1,939 @@ +civic:vid499, 41 +civic:vid306, 38 +civic:vid311, 25 +civic:vid214, 24 +civic:vid1002, 22 +civic:vid336, 20 +civic:vid186, 19 +civic:vid185, 18 +civic:vid55, 17 +civic:vid270, 16 +civic:vid324, 15 +civic:vid133, 14 +civic:vid442, 14 +civic:vid267, 13 +civic:vid801, 13 +civic:vid419, 11 +civic:vid925, 10 +civic:vid1206, 10 +civic:vid2681, 10 +civic:vid66, 9 +civic:vid17, 9 +civic:vid269, 9 +civic:vid2202, 9 +civic:vid1012, 9 +civic:vid1687, 9 +civic:vid1876, 9 +civic:vid132, 8 +civic:vid178, 8 +civic:vid222, 8 +civic:vid18, 8 +civic:vid208, 8 +civic:vid509, 8 +civic:vid875, 8 +civic:vid77, 7 +civic:vid131, 7 +civic:vid193, 7 +civic:vid272, 7 +civic:vid212, 7 +civic:vid314, 7 +civic:vid414, 7 +civic:vid276, 7 +civic:vid554, 7 +civic:vid587, 7 +civic:vid726, 7 +civic:vid890, 7 +civic:vid1003, 7 +civic:vid75, 6 +civic:vid1, 6 +civic:vid312, 6 +civic:vid399, 6 +civic:vid437, 6 +civic:vid503, 6 +civic:vid574, 6 +civic:vid618, 6 +civic:vid354, 6 +civic:vid621, 6 +civic:vid1010, 6 +civic:vid5, 5 +civic:vid173, 5 +civic:vid202, 5 +civic:vid289, 5 +civic:vid20, 5 +civic:vid190, 5 +civic:vid429, 5 +civic:vid465, 5 +civic:vid58, 5 +civic:vid961, 5 +civic:vid1004, 5 +civic:vid1006, 5 +civic:vid1007, 5 +civic:vid1008, 5 +civic:vid1009, 5 +civic:vid1011, 5 +civic:vid1255, 5 +civic:vid570, 5 +civic:vid519, 5 +civic:vid94, 4 +civic:vid86, 4 +civic:vid125, 4 +civic:vid108, 4 +civic:vid268, 4 +civic:vid298, 4 +civic:vid315, 4 +civic:vid510, 4 +civic:vid513, 4 +civic:vid516, 4 +civic:vid535, 4 +civic:vid629, 4 +civic:vid800, 4 +civic:vid1301, 4 +civic:vid977, 4 +civic:vid645, 4 +civic:vid1497, 4 +civic:vid1515, 4 +civic:vid1579, 4 +civic:vid1630, 4 +civic:vid1631, 4 +civic:vid1632, 4 +civic:vid2371, 4 +civic:vid56, 3 +civic:vid25, 3 +civic:vid85, 3 +civic:vid170, 3 +civic:vid181, 3 +civic:vid273, 3 +civic:vid597, 3 +civic:vid313, 3 +civic:vid347, 3 +civic:vid352, 3 +civic:vid436, 3 +civic:vid221, 3 +civic:vid500, 3 +civic:vid388, 3 +civic:vid511, 3 +civic:vid160, 3 +civic:vid736, 3 +civic:vid797, 3 +civic:vid827, 3 +civic:vid839, 3 +civic:vid698, 3 +civic:vid949, 3 +civic:vid953, 3 +civic:vid962, 3 +civic:vid964, 3 +civic:vid967, 3 +civic:vid1277, 3 +civic:vid1558, 3 +civic:vid1445, 3 +civic:vid2902, 3 +civic:vid1569, 3 +civic:vid1691, 3 +civic:vid2286, 3 +civic:vid2992, 3 +civic:vid2375, 3 +civic:vid1690, 3 +civic:vid2693, 3 +civic:vid2794, 3 +civic:vid2906, 3 +civic:vid32, 2 +civic:vid101, 2 +civic:vid124, 2 +civic:vid41, 2 +civic:vid156, 2 +civic:vid282, 2 +civic:vid285, 2 +civic:vid286, 2 +civic:vid287, 2 +civic:vid300, 2 +civic:vid301, 2 +civic:vid309, 2 +civic:vid318, 2 +civic:vid335, 2 +civic:vid361, 2 +civic:vid365, 2 +civic:vid377, 2 +civic:vid382, 2 +civic:vid396, 2 +civic:vid405, 2 +civic:vid413, 2 +civic:vid462, 2 +civic:vid487, 2 +civic:vid507, 2 +civic:vid506, 2 +civic:vid505, 2 +civic:vid518, 2 +civic:vid520, 2 +civic:vid526, 2 +civic:vid532, 2 +civic:vid553, 2 +civic:vid586, 2 +civic:vid602, 2 +civic:vid606, 2 +civic:vid626, 2 +civic:vid630, 2 +civic:vid633, 2 +civic:vid485, 2 +civic:vid637, 2 +civic:vid187, 2 +civic:vid666, 2 +civic:vid693, 2 +civic:vid24, 2 +civic:vid695, 2 +civic:vid696, 2 +civic:vid817, 2 +civic:vid573, 2 +civic:vid782, 2 +civic:vid830, 2 +civic:vid1854, 2 +civic:vid946, 2 +civic:vid954, 2 +civic:vid966, 2 +civic:vid2623, 2 +civic:vid2174, 2 +civic:vid206, 2 +civic:vid1105, 2 +civic:vid76, 2 +civic:vid3225, 2 +civic:vid369, 2 +civic:vid1293, 2 +civic:vid1577, 2 +civic:vid1550, 2 +civic:vid2650, 2 +civic:vid1446, 2 +civic:vid1498, 2 +civic:vid1544, 2 +civic:vid1548, 2 +civic:vid1551, 2 +civic:vid1567, 2 +civic:vid2900, 2 +civic:vid2901, 2 +civic:vid1658, 2 +civic:vid1665, 2 +civic:vid2214, 2 +civic:vid159, 2 +civic:vid1688, 2 +civic:vid1877, 2 +civic:vid653, 2 +civic:vid1878, 2 +civic:vid1880, 2 +civic:vid1257, 2 +civic:vid2171, 2 +civic:vid2178, 2 +civic:vid2203, 2 +civic:vid2204, 2 +civic:vid2205, 2 +civic:vid2209, 2 +civic:vid2289, 2 +civic:vid2340, 2 +civic:vid592, 2 +civic:vid2376, 2 +civic:vid2397, 2 +civic:vid2409, 2 +civic:vid2575, 2 +civic:vid2578, 2 +civic:vid2585, 2 +civic:vid2586, 2 +civic:vid2619, 2 +civic:vid2167, 2 +civic:vid2658, 2 +civic:vid2679, 2 +civic:vid2571, 2 +civic:vid2682, 2 +civic:vid2704, 2 +civic:vid2705, 2 +civic:vid2749, 2 +civic:vid2759, 2 +civic:vid2166, 2 +civic:vid2771, 2 +civic:vid2778, 2 +civic:vid2766, 2 +civic:vid2417, 2 +civic:vid2909, 2 +civic:vid2911, 2 +civic:vid2917, 2 +civic:vid2944, 2 +civic:vid2997, 2 +civic:vid2978, 2 +civic:vid3029, 2 +civic:vid3033, 2 +civic:vid3118, 2 +civic:vid3211, 2 +civic:vid3223, 2 +civic:vid3210, 2 +civic:vid3243, 2 +moa:vid1, 1 +moa:vid2, 1 +moa:vid3, 1 +moa:vid4, 1 +moa:vid5, 1 +moa:vid6, 1 +moa:vid7, 1 +moa:vid8, 1 +moa:vid9, 1 +moa:vid10, 1 +moa:vid11, 1 +moa:vid12, 1 +moa:vid14, 1 +moa:vid15, 1 +moa:vid16, 1 +moa:vid17, 1 +moa:vid18, 1 +moa:vid19, 1 +moa:vid21, 1 +moa:vid23, 1 +moa:vid24, 1 +moa:vid26, 1 +moa:vid27, 1 +moa:vid30, 1 +moa:vid35, 1 +moa:vid47, 1 +moa:vid48, 1 +moa:vid49, 1 +moa:vid50, 1 +moa:vid51, 1 +moa:vid52, 1 +moa:vid53, 1 +moa:vid54, 1 +moa:vid55, 1 +moa:vid60, 1 +moa:vid63, 1 +moa:vid65, 1 +moa:vid67, 1 +moa:vid68, 1 +moa:vid70, 1 +moa:vid77, 1 +moa:vid126, 1 +moa:vid127, 1 +moa:vid128, 1 +moa:vid133, 1 +moa:vid140, 1 +moa:vid145, 1 +moa:vid182, 1 +moa:vid183, 1 +moa:vid185, 1 +moa:vid191, 1 +moa:vid192, 1 +moa:vid193, 1 +moa:vid194, 1 +moa:vid195, 1 +moa:vid196, 1 +moa:vid197, 1 +moa:vid198, 1 +moa:vid200, 1 +moa:vid201, 1 +moa:vid202, 1 +moa:vid203, 1 +moa:vid207, 1 +moa:vid208, 1 +moa:vid209, 1 +moa:vid210, 1 +moa:vid211, 1 +moa:vid212, 1 +moa:vid213, 1 +moa:vid214, 1 +moa:vid215, 1 +moa:vid216, 1 +moa:vid217, 1 +moa:vid218, 1 +moa:vid221, 1 +moa:vid231, 1 +moa:vid232, 1 +moa:vid239, 1 +moa:vid246, 1 +moa:vid253, 1 +moa:vid254, 1 +moa:vid256, 1 +moa:vid257, 1 +moa:vid258, 1 +moa:vid262, 1 +moa:vid264, 1 +moa:vid266, 1 +moa:vid269, 1 +moa:vid277, 1 +moa:vid278, 1 +moa:vid281, 1 +moa:vid283, 1 +moa:vid285, 1 +moa:vid287, 1 +moa:vid288, 1 +moa:vid289, 1 +moa:vid301, 1 +moa:vid302, 1 +moa:vid307, 1 +moa:vid308, 1 +moa:vid309, 1 +moa:vid310, 1 +moa:vid311, 1 +moa:vid312, 1 +moa:vid338, 1 +moa:vid339, 1 +moa:vid340, 1 +moa:vid341, 1 +moa:vid344, 1 +moa:vid347, 1 +moa:vid349, 1 +moa:vid350, 1 +moa:vid358, 1 +moa:vid360, 1 +moa:vid361, 1 +moa:vid362, 1 +moa:vid363, 1 +moa:vid366, 1 +moa:vid367, 1 +moa:vid368, 1 +moa:vid369, 1 +moa:vid370, 1 +moa:vid371, 1 +moa:vid374, 1 +moa:vid381, 1 +moa:vid382, 1 +moa:vid383, 1 +moa:vid384, 1 +moa:vid399, 1 +moa:vid400, 1 +moa:vid401, 1 +moa:vid405, 1 +moa:vid407, 1 +moa:vid408, 1 +moa:vid409, 1 +moa:vid410, 1 +moa:vid412, 1 +moa:vid413, 1 +moa:vid441, 1 +moa:vid442, 1 +moa:vid443, 1 +moa:vid445, 1 +moa:vid447, 1 +moa:vid469, 1 +moa:vid474, 1 +moa:vid476, 1 +moa:vid478, 1 +moa:vid480, 1 +moa:vid482, 1 +moa:vid483, 1 +moa:vid484, 1 +moa:vid485, 1 +moa:vid486, 1 +moa:vid487, 1 +moa:vid489, 1 +moa:vid490, 1 +moa:vid491, 1 +moa:vid492, 1 +moa:vid494, 1 +moa:vid501, 1 +moa:vid508, 1 +moa:vid509, 1 +moa:vid544, 1 +moa:vid545, 1 +moa:vid546, 1 +moa:vid550, 1 +moa:vid552, 1 +moa:vid553, 1 +moa:vid554, 1 +moa:vid559, 1 +moa:vid560, 1 +moa:vid561, 1 +moa:vid562, 1 +moa:vid563, 1 +moa:vid564, 1 +moa:vid565, 1 +moa:vid566, 1 +moa:vid567, 1 +moa:vid568, 1 +moa:vid569, 1 +moa:vid570, 1 +moa:vid575, 1 +moa:vid582, 1 +moa:vid583, 1 +moa:vid584, 1 +moa:vid585, 1 +moa:vid586, 1 +moa:vid587, 1 +moa:vid588, 1 +moa:vid589, 1 +moa:vid590, 1 +moa:vid592, 1 +moa:vid594, 1 +moa:vid595, 1 +moa:vid597, 1 +moa:vid598, 1 +moa:vid599, 1 +moa:vid600, 1 +moa:vid601, 1 +moa:vid603, 1 +moa:vid604, 1 +moa:vid606, 1 +moa:vid607, 1 +moa:vid608, 1 +moa:vid609, 1 +moa:vid610, 1 +moa:vid611, 1 +moa:vid612, 1 +moa:vid613, 1 +moa:vid614, 1 +moa:vid615, 1 +moa:vid616, 1 +moa:vid618, 1 +moa:vid639, 1 +moa:vid644, 1 +moa:vid646, 1 +moa:vid675, 1 +moa:vid676, 1 +moa:vid677, 1 +moa:vid680, 1 +moa:vid682, 1 +moa:vid684, 1 +moa:vid685, 1 +moa:vid687, 1 +moa:vid688, 1 +moa:vid689, 1 +moa:vid692, 1 +moa:vid693, 1 +moa:vid694, 1 +moa:vid695, 1 +moa:vid697, 1 +moa:vid699, 1 +moa:vid700, 1 +moa:vid701, 1 +moa:vid704, 1 +moa:vid706, 1 +moa:vid708, 1 +moa:vid709, 1 +moa:vid710, 1 +moa:vid719, 1 +moa:vid720, 1 +moa:vid721, 1 +moa:vid722, 1 +moa:vid723, 1 +moa:vid724, 1 +moa:vid725, 1 +moa:vid726, 1 +moa:vid727, 1 +moa:vid730, 1 +moa:vid732, 1 +moa:vid733, 1 +moa:vid734, 1 +moa:vid735, 1 +moa:vid742, 1 +moa:vid743, 1 +moa:vid744, 1 +moa:vid745, 1 +moa:vid746, 1 +moa:vid747, 1 +moa:vid748, 1 +moa:vid749, 1 +moa:vid750, 1 +moa:vid751, 1 +moa:vid755, 1 +moa:vid756, 1 +moa:vid757, 1 +moa:vid759, 1 +moa:vid760, 1 +moa:vid762, 1 +moa:vid766, 1 +moa:vid768, 1 +moa:vid771, 1 +moa:vid772, 1 +moa:vid773, 1 +moa:vid778, 1 +moa:vid779, 1 +moa:vid780, 1 +moa:vid781, 1 +moa:vid782, 1 +moa:vid783, 1 +moa:vid784, 1 +moa:vid786, 1 +moa:vid787, 1 +moa:vid789, 1 +moa:vid790, 1 +moa:vid791, 1 +civic:vid102, 1 +civic:vid13, 1 +civic:vid14, 1 +civic:vid29, 1 +civic:vid23, 1 +civic:vid52, 1 +civic:vid67, 1 +civic:vid119, 1 +civic:vid155, 1 +civic:vid176, 1 +civic:vid210, 1 +civic:vid236, 1 +civic:vid251, 1 +civic:vid252, 1 +civic:vid266, 1 +civic:vid278, 1 +civic:vid279, 1 +civic:vid290, 1 +civic:vid292, 1 +civic:vid302, 1 +civic:vid305, 1 +civic:vid310, 1 +civic:vid323, 1 +civic:vid326, 1 +civic:vid328, 1 +civic:vid329, 1 +civic:vid332, 1 +civic:vid337, 1 +civic:vid338, 1 +civic:vid342, 1 +civic:vid346, 1 +civic:vid348, 1 +civic:vid355, 1 +civic:vid356, 1 +civic:vid357, 1 +civic:vid358, 1 +civic:vid359, 1 +civic:vid360, 1 +civic:vid363, 1 +civic:vid364, 1 +civic:vid366, 1 +civic:vid254, 1 +civic:vid370, 1 +civic:vid179, 1 +civic:vid371, 1 +civic:vid373, 1 +civic:vid378, 1 +civic:vid379, 1 +civic:vid380, 1 +civic:vid381, 1 +civic:vid383, 1 +civic:vid384, 1 +civic:vid386, 1 +civic:vid387, 1 +civic:vid389, 1 +civic:vid392, 1 +civic:vid393, 1 +civic:vid394, 1 +civic:vid395, 1 +civic:vid397, 1 +civic:vid398, 1 +civic:vid400, 1 +civic:vid401, 1 +civic:vid404, 1 +civic:vid406, 1 +civic:vid408, 1 +civic:vid409, 1 +civic:vid412, 1 +civic:vid416, 1 +civic:vid426, 1 +civic:vid428, 1 +civic:vid461, 1 +civic:vid463, 1 +civic:vid464, 1 +civic:vid466, 1 +civic:vid473, 1 +civic:vid475, 1 +civic:vid486, 1 +civic:vid488, 1 +civic:vid492, 1 +civic:vid493, 1 +civic:vid501, 1 +civic:vid558, 1 +civic:vid508, 1 +civic:vid213, 1 +civic:vid512, 1 +civic:vid514, 1 +civic:vid522, 1 +civic:vid529, 1 +civic:vid536, 1 +civic:vid538, 1 +civic:vid550, 1 +civic:vid555, 1 +civic:vid576, 1 +civic:vid578, 1 +civic:vid559, 1 +civic:vid568, 1 +civic:vid599, 1 +civic:vid601, 1 +civic:vid603, 1 +civic:vid615, 1 +civic:vid616, 1 +civic:vid617, 1 +civic:vid619, 1 +civic:vid189, 1 +civic:vid625, 1 +civic:vid632, 1 +civic:vid638, 1 +civic:vid281, 1 +civic:vid639, 1 +civic:vid640, 1 +civic:vid646, 1 +civic:vid652, 1 +civic:vid422, 1 +civic:vid654, 1 +civic:vid655, 1 +civic:vid656, 1 +civic:vid657, 1 +civic:vid659, 1 +civic:vid435, 1 +civic:vid660, 1 +civic:vid671, 1 +civic:vid672, 1 +civic:vid673, 1 +civic:vid697, 1 +civic:vid716, 1 +civic:vid718, 1 +civic:vid720, 1 +civic:vid729, 1 +civic:vid732, 1 +civic:vid774, 1 +civic:vid779, 1 +civic:vid795, 1 +civic:vid796, 1 +civic:vid805, 1 +civic:vid818, 1 +civic:vid819, 1 +civic:vid821, 1 +civic:vid158, 1 +civic:vid855, 1 +civic:vid858, 1 +civic:vid859, 1 +civic:vid867, 1 +civic:vid868, 1 +civic:vid869, 1 +civic:vid2590, 1 +civic:vid3049, 1 +civic:vid948, 1 +civic:vid951, 1 +civic:vid2620, 1 +civic:vid956, 1 +civic:vid957, 1 +civic:vid2621, 1 +civic:vid943, 1 +civic:vid2622, 1 +civic:vid2643, 1 +civic:vid3274, 1 +civic:vid242, 1 +civic:vid3050, 1 +civic:vid3051, 1 +civic:vid1165, 1 +civic:vid1166, 1 +civic:vid1167, 1 +civic:vid1214, 1 +civic:vid1258, 1 +civic:vid1259, 1 +civic:vid1273, 1 +civic:vid1274, 1 +civic:vid1276, 1 +civic:vid3224, 1 +civic:vid1280, 1 +civic:vid1281, 1 +civic:vid1282, 1 +civic:vid1309, 1 +civic:vid1310, 1 +civic:vid1314, 1 +civic:vid1318, 1 +civic:vid1320, 1 +civic:vid891, 1 +civic:vid2651, 1 +civic:vid1433, 1 +civic:vid1511, 1 +civic:vid1513, 1 +civic:vid1514, 1 +civic:vid1518, 1 +civic:vid497, 1 +civic:vid1539, 1 +civic:vid1540, 1 +civic:vid1541, 1 +civic:vid1542, 1 +civic:vid1543, 1 +civic:vid1546, 1 +civic:vid1547, 1 +civic:vid1549, 1 +civic:vid1555, 1 +civic:vid1557, 1 +civic:vid1566, 1 +civic:vid1580, 1 +civic:vid1581, 1 +civic:vid1590, 1 +civic:vid1591, 1 +civic:vid1592, 1 +civic:vid1593, 1 +civic:vid1628, 1 +civic:vid1635, 1 +civic:vid1638, 1 +civic:vid1643, 1 +civic:vid1645, 1 +civic:vid1654, 1 +civic:vid589, 1 +civic:vid1662, 1 +civic:vid1663, 1 +civic:vid1664, 1 +civic:vid1667, 1 +civic:vid1668, 1 +civic:vid1672, 1 +civic:vid1676, 1 +civic:vid1677, 1 +civic:vid1681, 1 +civic:vid1682, 1 +civic:vid1684, 1 +civic:vid1685, 1 +civic:vid1689, 1 +civic:vid1765, 1 +civic:vid1879, 1 +civic:vid1982, 1 +civic:vid1983, 1 +civic:vid1261, 1 +civic:vid2159, 1 +civic:vid2163, 1 +civic:vid2164, 1 +civic:vid2172, 1 +civic:vid2179, 1 +civic:vid2207, 1 +civic:vid2210, 1 +civic:vid218, 1 +civic:vid2212, 1 +civic:vid2213, 1 +civic:vid2217, 1 +civic:vid2218, 1 +civic:vid2219, 1 +civic:vid2221, 1 +civic:vid2225, 1 +civic:vid2226, 1 +civic:vid2227, 1 +civic:vid2228, 1 +civic:vid2229, 1 +civic:vid2230, 1 +civic:vid2231, 1 +civic:vid2232, 1 +civic:vid2234, 1 +civic:vid2235, 1 +civic:vid2236, 1 +civic:vid2239, 1 +civic:vid2258, 1 +civic:vid2259, 1 +civic:vid2287, 1 +civic:vid1278, 1 +civic:vid2334, 1 +civic:vid2343, 1 +civic:vid2358, 1 +civic:vid2359, 1 +civic:vid2360, 1 +civic:vid2362, 1 +civic:vid2363, 1 +civic:vid2366, 1 +civic:vid349, 1 +civic:vid2385, 1 +civic:vid2386, 1 +civic:vid2390, 1 +civic:vid2391, 1 +civic:vid2393, 1 +civic:vid2394, 1 +civic:vid2395, 1 +civic:vid2396, 1 +civic:vid2401, 1 +civic:vid2414, 1 +civic:vid2550, 1 +civic:vid2564, 1 +civic:vid362, 1 +civic:vid220, 1 +civic:vid3214, 1 +civic:vid2582, 1 +civic:vid2584, 1 +civic:vid719, 1 +civic:vid2587, 1 +civic:vid2598, 1 +civic:vid2632, 1 +civic:vid2635, 1 +civic:vid2637, 1 +civic:vid2649, 1 +civic:vid2657, 1 +civic:vid2678, 1 +civic:vid2661, 1 +civic:vid2695, 1 +civic:vid2696, 1 +civic:vid2706, 1 +civic:vid2707, 1 +civic:vid2708, 1 +civic:vid2717, 1 +civic:vid2738, 1 +civic:vid569, 1 +civic:vid2743, 1 +civic:vid2746, 1 +civic:vid2747, 1 +civic:vid2750, 1 +civic:vid2754, 1 +civic:vid2758, 1 +civic:vid2762, 1 +civic:vid2764, 1 +civic:vid2769, 1 +civic:vid2770, 1 +civic:vid2772, 1 +civic:vid2783, 1 +civic:vid2784, 1 +civic:vid2785, 1 +civic:vid2807, 1 +civic:vid2809, 1 +civic:vid2810, 1 +civic:vid2812, 1 +civic:vid2813, 1 +civic:vid2814, 1 +civic:vid2416, 1 +civic:vid2815, 1 +civic:vid2816, 1 +civic:vid2819, 1 +civic:vid2820, 1 +civic:vid2821, 1 +civic:vid2822, 1 +civic:vid2824, 1 +civic:vid2825, 1 +civic:vid2827, 1 +civic:vid2833, 1 +civic:vid2854, 1 +civic:vid2856, 1 +civic:vid184, 1 +civic:vid2883, 1 +civic:vid2885, 1 +civic:vid2886, 1 +civic:vid2887, 1 +civic:vid2888, 1 +civic:vid2889, 1 +civic:vid2891, 1 +civic:vid2894, 1 +civic:vid2903, 1 +civic:vid2907, 1 +civic:vid2408, 1 +civic:vid2912, 1 +civic:vid2914, 1 +civic:vid3054, 1 +civic:vid2948, 1 +civic:vid2949, 1 +civic:vid2950, 1 +civic:vid2951, 1 +civic:vid2952, 1 +civic:vid2953, 1 +civic:vid2954, 1 +civic:vid2904, 1 +civic:vid2970, 1 +civic:vid2971, 1 +civic:vid2972, 1 +civic:vid2973, 1 +civic:vid3006, 1 +civic:vid376, 1 +civic:vid3009, 1 +civic:vid3014, 1 +civic:vid3024, 1 +civic:vid3025, 1 +civic:vid3055, 1 +civic:vid3110, 1 +civic:vid3112, 1 +civic:vid3113, 1 +civic:vid2663, 1 +civic:vid3131, 1 +civic:vid3200, 1 +civic:vid3204, 1 +civic:vid3207, 1 +civic:vid3208, 1 +civic:vid3209, 1 +civic:vid3219, 1 +civic:vid3220, 1 +civic:vid3221, 1 +civic:vid3081, 1 +civic:vid591, 1 +civic:vid3277, 1 +civic:vid3279, 1 +civic:vid3268, 1 +civic:vid3304, 1 +civic:vid3232, 1 diff --git a/analysis/graph/unsupported_variants_hgvs_counts.txt b/analysis/graph/unsupported_variants_hgvs_counts.txt new file mode 100644 index 00000000..dccf9a56 --- /dev/null +++ b/analysis/graph/unsupported_variants_hgvs_counts.txt @@ -0,0 +1,8 @@ +NP_001333827.1:p.Leu747_Thr751delinsPro, 6 +NP_000213.1:p.Val560del, 5 +NP_000213.1:p.Asp579del, 4 +NP_000213.1:p.Lys550_Lys558del, 3 +NP_000213.1:p.Val559_Val560del, 3 +NP_000213.1:p.Val559del, 2 +NP_000213.1:p.Val555_Gln556del, 1 +NP_005219.2:p.Glu746_Thr751delinsValAla, 1 diff --git a/analysis/moa/examples/harvester/moa_harvester_example.py b/analysis/moa/examples/harvester/moa_harvester_example.py index 0465c8ac..6e9fa59a 100644 --- a/analysis/moa/examples/harvester/moa_harvester_example.py +++ b/analysis/moa/examples/harvester/moa_harvester_example.py @@ -1,7 +1,7 @@ """Create an example json file for MOAlmanac Harvester.""" import json -from metakb import PROJECT_ROOT, APP_ROOT +from metakb import APP_ROOT, PROJECT_ROOT from metakb.harvesters import MOAHarvester @@ -9,29 +9,28 @@ def create_assertion_examples(data): """Create five MOAlmanac assertion examples.""" assertions = [] for i in [0, 69, 599, 699, 759]: - if data['assertions'][i]['source_ids']: - assertions.append(data['assertions'][i]) + if data["assertions"][i]["source_ids"]: + assertions.append(data["assertions"][i]) for assertion in assertions: - source_id = assertion['source_ids'] - for s in data['sources']: - if s['id'] == source_id: + source_id = assertion["source_ids"] + for s in data["sources"]: + if s["id"] == source_id: source = s break - feature_id = assertion['variant']['id'] - for v in data['variants']: - if v['id'] == feature_id: + feature_id = assertion["variant"]["id"] + for v in data["variants"]: + if v["id"] == feature_id: variant = v break - with open(f"{PROJECT_ROOT}/analysis/moa/examples/harvester/" - f"assertion {assertion['id']}.json", 'w+') as f: - example = { - 'ASSERTIONS': assertion, - 'SOURCES': source, - 'VARIANTS': variant - } + with open( + f"{PROJECT_ROOT}/analysis/moa/examples/harvester/" + f"assertion {assertion['id']}.json", + "w+", + ) as f: + example = {"ASSERTIONS": assertion, "SOURCES": source, "VARIANTS": variant} json.dump(example, f, indent=4) print(f"Created JSON for evidence: assertion {assertion['id']}") @@ -44,24 +43,29 @@ def create_variant_examples(data): """ variants_ids = [1, 147, 551, 701] variants = [] - for i in range(len(data['variants'])): - if data['variants'][i]['id'] in variants_ids: - variants.append(data['variants'][i]) + for i in range(len(data["variants"])): + if data["variants"][i]["id"] in variants_ids: + variants.append(data["variants"][i]) for variant in variants: - with open(f"{PROJECT_ROOT}/analysis/moa/examples/harvester/" - f"{variant['feature'].lower()}.json", 'w+') as f: + with open( + f"{PROJECT_ROOT}/analysis/moa/examples/harvester/" + f"{variant['feature'].lower()}.json", + "w+", + ) as f: f.write(json.dumps(variant, indent=4)) print(f"Created JSON for variant: {variant['feature']}") f.close() -if __name__ == '__main__': +if __name__ == "__main__": moa = MOAHarvester() moa.harvest() - latest = sorted((APP_ROOT / "data" / "moa" / "harvester").glob("moa_harvester_*.json"))[-1] # noqa: E501 + latest = sorted( + (APP_ROOT / "data" / "moa" / "harvester").glob("moa_harvester_*.json") + )[-1] with open(latest, "r") as f: moa_data = json.load(f) - moa_ex_dir = PROJECT_ROOT / 'analysis' / 'moa' / 'examples' + moa_ex_dir = PROJECT_ROOT / "analysis" / "moa" / "examples" moa_ex_dir.mkdir(exist_ok=True, parents=True) create_assertion_examples(moa_data) create_variant_examples(moa_data) diff --git a/analysis/moa/examples/transform/moa_transform_example.py b/analysis/moa/examples/transform/moa_transform_example.py index 73e3a01b..507e6c31 100644 --- a/analysis/moa/examples/transform/moa_transform_example.py +++ b/analysis/moa/examples/transform/moa_transform_example.py @@ -1,13 +1,13 @@ """Create an example json file for MOA Transform.""" import json -from metakb import PROJECT_ROOT, APP_ROOT +from metakb import APP_ROOT, PROJECT_ROOT from metakb.transform import MOATransform def create_moa_example(moa_data): """Create MOA transform examples from list of evidence items.""" - assertion_id = ['moa.assertion:71', 'moa.assertion:188'] + assertion_id = ["moa.assertion:71", "moa.assertion:188"] ex = {} proposition = None var_des = None @@ -18,55 +18,60 @@ def create_moa_example(moa_data): doc = None for asst_id in assertion_id: - for statement in moa_data['statements']: - if statement['id'] == asst_id: - ex['statements'] = [statement] - proposition = statement['proposition'] - var_des = statement['variation_descriptor'] - t_des = statement['therapy_descriptor'] - d_des = statement['disease_descriptor'] - method = statement['method'] - doc = statement['supported_by'][0] + for statement in moa_data["statements"]: + if statement["id"] == asst_id: + ex["statements"] = [statement] + proposition = statement["proposition"] + var_des = statement["variation_descriptor"] + t_des = statement["therapy_descriptor"] + d_des = statement["disease_descriptor"] + method = statement["method"] + doc = statement["supported_by"][0] - for p in moa_data['propositions']: - if p['id'] == proposition: - ex['propositions'] = [p] + for p in moa_data["propositions"]: + if p["id"] == proposition: + ex["propositions"] = [p] - for v in moa_data['variation_descriptors']: - if v['id'] == var_des: - ex['variation_descriptors'] = [v] - g_des = v['gene_context'] + for v in moa_data["variation_descriptors"]: + if v["id"] == var_des: + ex["variation_descriptors"] = [v] + g_des = v["gene_context"] - for g in moa_data['gene_descriptors']: - if g['id'] == g_des: - ex['gene_descriptors'] = [g] + for g in moa_data["gene_descriptors"]: + if g["id"] == g_des: + ex["gene_descriptors"] = [g] - for t in moa_data['therapy_descriptors']: - if t['id'] == t_des: - ex['therapy_descriptors'] = [t] + for t in moa_data["therapy_descriptors"]: + if t["id"] == t_des: + ex["therapy_descriptors"] = [t] - for d in moa_data['disease_descriptors']: - if d['id'] == d_des: - ex['disease_descriptors'] = [d] + for d in moa_data["disease_descriptors"]: + if d["id"] == d_des: + ex["disease_descriptors"] = [d] - for m in moa_data['methods']: - if m['id'] == method: - ex['methods'] = [m] + for m in moa_data["methods"]: + if m["id"] == method: + ex["methods"] = [m] - for d in moa_data['documents']: - if d['id'] == doc: - ex['documents'] = [d] + for d in moa_data["documents"]: + if d["id"] == doc: + ex["documents"] = [d] - with open(f"{PROJECT_ROOT}/analysis/moa/examples/transform/" - f"{ex['statements'][0]['id']}.json", 'w+') as f: + with open( + f"{PROJECT_ROOT}/analysis/moa/examples/transform/" + f"{ex['statements'][0]['id']}.json", + "w+", + ) as f: json.dump(ex, f, indent=4) -if __name__ == '__main__': +if __name__ == "__main__": moa = MOATransform() moa.transform() moa.create_json() - latest = sorted((APP_ROOT / "data" / "moa" / "transform").glob("moa_cdm_*.json"))[-1] # noqa: E501 + latest = sorted((APP_ROOT / "data" / "moa" / "transform").glob("moa_cdm_*.json"))[ + -1 + ] with open(latest, "r") as f: moa_data = json.load(f) create_moa_example(moa_data) diff --git a/analysis/pharmgkb/examples/harvester/pharmgkb_harvester.py b/analysis/pharmgkb/examples/harvester/pharmgkb_harvester.py new file mode 100644 index 00000000..b95cf0b6 --- /dev/null +++ b/analysis/pharmgkb/examples/harvester/pharmgkb_harvester.py @@ -0,0 +1,7 @@ +"""Grab some example data for pharmgkb.""" + +from metakb.harvesters import PharmGKBHarvester + +if __name__ == "__main__": + ph = PharmGKBHarvester() + ph.harvest() diff --git a/analysis/pharmgkb/initial exploratory analysis.ipynb b/analysis/pharmgkb/initial exploratory analysis.ipynb new file mode 100644 index 00000000..2ac2b9aa --- /dev/null +++ b/analysis/pharmgkb/initial exploratory analysis.ipynb @@ -0,0 +1,151 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e5519a4a", + "metadata": {}, + "source": [ + "# PharmGKB Data Exploration" + ] + }, + { + "cell_type": "markdown", + "id": "710c6bd6", + "metadata": {}, + "source": [ + "## Files\n", + "\n", + "Relationships:\n", + " * variants.tsv: key variant IDs to dbSNP IDs and HGVS strings\n", + "\n", + "\n", + "Clinical annotations:\n", + " * clinical_annotations.tsv:\n", + " * clinical_ann_alleles.tsv:\n", + " * clinical_ann_evidence.tsv:\n", + "\n", + "\n", + "Variant annotations:\n", + "\n", + "* var_pheno_ann.tsv: Contains associations in which the variant affects a phenotype, with or without drug information.\n", + "* var_drug_ann.tsv: Contains associations in which the variant affects a drug dose, response, metabolism, etc\n", + "* var_fa_ann.tsv: Contains in vitro and functional analysis-type associations.\n", + "* study_parameters.tsv: Contains information about the study population size, biogeographical group and statistics for the variant annotations; this file is cross-referenced against the 3 variant annotation files.\n", + "* CREATED_xxxx-xx-xx.txt: This file indicates the date that all files in this group were created from the database.\n", + "\n", + "\n", + "## Data structures\n", + "\n", + "Variant annotation (\"*PharmGKB variant annotations report the association between a variant (e.g. SNP, indel, repeat, haplotype) and a drug phenotype from a single publication*\"):\n", + "\n", + "Clinical annotation:\n", + " * annotation ID\n", + " * variant/alleles\n", + " * gene\n", + " * Evidence level, level modifier, level override\n", + " * Evidence PMID\n", + " * Score\n", + " * Phenotype category {toxicity, efficacy, dosage, metabolism/PK, PD, other}\n", + " * Drug(s)\n", + " * Phenotype(s)\n", + " * Specialty population\n", + "\n", + "Variant:\n", + " * pharmGKB ID\n", + " * dbSNP ID\n", + " * Alleles?\n", + "\n", + "\n", + "\n", + "## Outstanding questions\n", + "\n", + "License (share alike) okay?\n", + "\n", + "How to properly VRSify dbSNP IDs?" + ] + }, + { + "cell_type": "markdown", + "id": "172ca28c", + "metadata": {}, + "source": [ + "### Extracting direction" + ] + }, + { + "cell_type": "markdown", + "id": "57135882", + "metadata": {}, + "source": [ + "### DrugLabel\n", + "\n", + "\"PharmGKB annotates drug labels containing pharmacogenetic information approved by the US Food and Drug Administration (FDA), European Medicines Agency (EMA), Swiss Agency of Therapeutic Products (Swissmedic), Pharmaceuticals and Medical Devices Agency, Japan (PMDA) and Health Canada (Santé Canada) (HCSC).\"\n", + "\n", + "**Sources**:\n", + "\n", + "* US FDA: \"Information is gathered from the FDA's \"Table of Pharmacogenomic Biomarkers in Drug Labels\" and from FDA-approved labels brought to our attention\"\n", + " * https://www.fda.gov/drugs/science-research-drugs/table-pharmacogenomic-biomarkers-drug-labeling\n", + " * \"The Biomarker badge On FDA Biomarker List refers to a drug label that is found on the FDA's Table of Pharmacogenomic Biomarkers in Drug Labels.\"\n", + "* Swissmedic: \"The Swissmedic drug label annotations are sourced through a collaboration with the Pharmaceutical Care Research Group (PCRG), Department of Pharmaceutical Sciences, University of Basel, who provided a translation of the pharmacogenetic information contained in the Swissmedic drug labels, screened by natural language processing (NLP).\"\n", + "* EMA: \"European Public Assessment Reports (EPARs) that contain PGx information were identified from [Article:24433361] and also by searching for drugs for which we have PGx-containing FDA drug labels.\"\n", + " * See https://www.pharmgkb.org/literature/15069531\n", + "* PMDA: \"Unless otherwise stated, Japanese drug label annotations were translated through a collaboration with the Japanese Society of Pharmacogenomics and Silicon Valley Tech KK. PMDA package inserts were selected to be examined for PGx information by searching for drugs for which we had PGx-containing FDA, EMA or HCSC labels at the time.\"\n", + "* HCSC: \"Canadian drug labels (referred to as product monographs) are sourced from Health Canada's Drug Product Database (DPD). Product monographs that contain PGx information are identified by searching for drugs for which we have PGx-containing FDA labels.\"\n", + " * http://www.hc-sc.gc.ca/dhp-mps/prodpharma/databasdon/index-eng.php" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15c3cb88", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "557ebcd3", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d14b482", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6800b79c", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "metakb", + "language": "python", + "name": "metakb" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/analysis/pharmgkb/notes.md b/analysis/pharmgkb/notes.md new file mode 100644 index 00000000..2c20e972 --- /dev/null +++ b/analysis/pharmgkb/notes.md @@ -0,0 +1,41 @@ +# PharmGKB notes + +## Files + +Relationships: + + +Clinical annotations: + * clinical_annotations.tsv: + * clinical_ann_alleles.tsv: + * clinical_ann_evidence.tsv: + + +Variant annotations: + + +## Data structures + +Clinical annotation: + * annotation ID + * variant/alleles + * gene + * Evidence level, level modifier, level override + * Evidence PMID + * Score + * Phenotype category {toxicity, efficacy, dosage, metabolism/PK, PD, other} + * Drug(s) + * Phenotype(s) + * Specialty population + +Variant: + * pharmGKB ID + * dbSNP ID + * Alleles? + + + +## Misc questions + +License (share alike) okay? +How to dereference dbSNP IDs? diff --git a/metakb/__init__.py b/metakb/__init__.py index 46186a18..147aa86e 100644 --- a/metakb/__init__.py +++ b/metakb/__init__.py @@ -1,28 +1,28 @@ """The MetaKB package.""" -from pathlib import Path import logging from os import environ +from pathlib import Path APP_ROOT = Path(__file__).resolve().parents[0] PROJECT_ROOT = Path(__file__).resolve().parents[1] -if 'METAKB_NORM_EB_PROD' in environ: +if "METAKB_NORM_EB_PROD" in environ: LOG_FN = "/tmp/metakb.log" else: LOG_FN = "metakb.log" logging.basicConfig( - filename=LOG_FN, - format='[%(asctime)s] - %(name)s - %(levelname)s : %(message)s') -logger = logging.getLogger('metakb') + filename=LOG_FN, format="[%(asctime)s] - %(name)s - %(levelname)s : %(message)s" +) +logger = logging.getLogger("metakb") logger.setLevel(logging.DEBUG) logging.getLogger("boto3").setLevel(logging.INFO) logging.getLogger("botocore").setLevel(logging.INFO) logging.getLogger("urllib3").setLevel(logging.INFO) logging.getLogger("python_jsonschema_objects").setLevel(logging.INFO) logging.getLogger("hgvs.parser").setLevel(logging.INFO) -logging.getLogger("biocommons.seqrepo.seqaliasdb.seqaliasdb").setLevel(logging.INFO) # noqa: E501 -logging.getLogger("biocommons.seqrepo.fastadir.fastadir").setLevel(logging.INFO) # noqa: E501 +logging.getLogger("biocommons.seqrepo.seqaliasdb.seqaliasdb").setLevel(logging.INFO) +logging.getLogger("biocommons.seqrepo.fastadir.fastadir").setLevel(logging.INFO) logging.getLogger("requests_cache.patcher").setLevel(logging.INFO) logging.getLogger("bioregistry.resource_manager").setLevel(logging.INFO) logging.getLogger("blib2to3.pgen2.driver").setLevel(logging.INFO) @@ -30,7 +30,7 @@ logging.getLogger("asyncio").setLevel(logging.INFO) logger.handlers = [] -if 'METAKB_NORM_EB_PROD' in environ: +if "METAKB_NORM_EB_PROD" in environ: ch = logging.StreamHandler() ch.setLevel(logging.DEBUG) logger.addHandler(ch) diff --git a/metakb/cli.py b/metakb/cli.py index 3ce9de83..9500cf2c 100644 --- a/metakb/cli.py +++ b/metakb/cli.py @@ -1,42 +1,40 @@ -""" -Provide CLI utility for performing data collection, transformation, and upload +"""Provide CLI utility for performing data collection, transformation, and upload to graph datastore. """ -from timeit import default_timer as timer -from os import environ import logging -from typing import Optional -from pathlib import Path import re import tempfile +from os import environ +from pathlib import Path +from timeit import default_timer as timer +from typing import Optional from zipfile import ZipFile import asyncclick as click +import boto3 +import gene.cli as GeneCLI # noqa: N812 +from boto3.exceptions import ResourceLoadException +from botocore.config import Config +from disease.cli import CLI as DiseaseCLI # noqa: N811 from disease.database import Database as DiseaseDatabase from disease.schemas import SourceName as DiseaseSources -from disease.cli import CLI as DiseaseCLI -from therapy.database import Database as TherapyDatabase -from therapy.schemas import SourceName as TherapySources -from therapy.cli import CLI as TherapyCLI from gene.database.dynamodb import DynamoDbDatabase as GeneDatabase from gene.schemas import SourceName as GeneSources -import gene.cli as GeneCLI -import boto3 -from boto3.exceptions import ResourceLoadException -from botocore.config import Config +from therapy.cli import CLI as TherapyCLI # noqa: N811 +from therapy.database import Database as TherapyDatabase +from therapy.schemas import SourceName as TherapySources from metakb import APP_ROOT from metakb.database import Graph +from metakb.harvesters import CIViCHarvester, Harvester, MOAHarvester, OncoKBHarvester from metakb.schemas import SourceName -from metakb.harvesters import Harvester, CIViCHarvester, MOAHarvester, OncoKBHarvester -from metakb.transform import Transform, CIViCTransform, MOATransform, OncoKBTransform +from metakb.transform import CIViCTransform, MOATransform, OncoKBTransform, Transform - -logger = logging.getLogger('metakb.cli') +logger = logging.getLogger("metakb.cli") logger.setLevel(logging.DEBUG) -def echo_info(msg: str): +def echo_info(msg: str) -> None: """Log (as INFO) and echo given message. :param str msg: message to emit """ @@ -50,59 +48,72 @@ class CLI: @staticmethod @click.command() @click.option( - '--db_url', - help=('URL endpoint for the application Neo4j database. Can also be ' - 'provided via environment variable METAKB_DB_URL.') + "--db_url", + help=( + "URL endpoint for the application Neo4j database. Can also be " + "provided via environment variable METAKB_DB_URL." + ), ) @click.option( - '--db_username', - help=('Username to provide to application database. Can also be ' - 'provided via environment variable METAKB_DB_USERNAME.') + "--db_username", + help=( + "Username to provide to application database. Can also be " + "provided via environment variable METAKB_DB_USERNAME." + ), ) @click.option( - '--db_password', - help=('Password to provide to application database. Can also be ' - 'provided via environment variable METAKB_DB_PASSWORD.') + "--db_password", + help=( + "Password to provide to application database. Can also be " + "provided via environment variable METAKB_DB_PASSWORD." + ), ) @click.option( - '--load_normalizers_db', - '-i', + "--load_normalizers_db", + "-i", is_flag=True, default=False, - help='Check normalizers database and load data if necessary.' + help="Check normalizers database and load data if necessary.", ) @click.option( - '--force_load_normalizers_db', - '-f', + "--force_load_normalizers_db", + "-f", is_flag=True, default=False, - help=('Load all normalizers data into database. Overrides ' - '--load_normalizers_db if both are selected.') + help=( + "Load all normalizers data into database. Overrides " + "--load_normalizers_db if both are selected." + ), ) @click.option( - '--normalizers_db_url', - default='http://localhost:8000', - help=('URL endpoint of normalizers DynamoDB database. Set to ' - '`http://localhost:8000` by default.') + "--normalizers_db_url", + default="http://localhost:8000", + help=( + "URL endpoint of normalizers DynamoDB database. Set to " + "`http://localhost:8000` by default." + ), ) @click.option( "--load_latest_cdms", "-l", is_flag=True, default=False, - help=("Clear MetaKB database and load most recent available source " - "CDM files. Does not run harvest and transform methods to " - "generate new CDM files. Exclusive with --load_target_cdm and " - "--load_latest_s3_cdms.") + help=( + "Clear MetaKB database and load most recent available source " + "CDM files. Does not run harvest and transform methods to " + "generate new CDM files. Exclusive with --load_target_cdm and " + "--load_latest_s3_cdms." + ), ) @click.option( "--load_target_cdm", "-t", - type=click.Path(exists=True, dir_okay=False, readable=True, - path_type=Path), + type=click.Path(exists=True, dir_okay=False, readable=True, path_type=Path), required=False, - help=("Load transformed CDM file at specified path. Exclusive with " - "--load_latest_cdms and --load_latest_s3_cdms.") + help=( + "Load transformed CDM file at specified path. Exclusive with " + "--load_latest_cdms and --load_latest_s3_cdms." + ), ) @click.option( "--load_latest_s3_cdms", @@ -110,10 +121,12 @@ class CLI: is_flag=True, default=False, required=False, - help=("Clear MetaKB database, retrieve most recent data available " - "from VICC S3 bucket, and load the database with retrieved " - "data. Will not download OncoKB transformed data. Exclusive with" - " --load_latest_cdms and load_target_cdm.") + help=( + "Clear MetaKB database, retrieve most recent data available " + "from VICC S3 bucket, and load the database with retrieved " + "data. Will not download OncoKB transformed data. Exclusive with" + " --load_latest_cdms and load_target_cdm." + ), ) @click.option( "--update_cached", @@ -121,8 +134,10 @@ class CLI: is_flag=True, default=False, required=False, - help=("`True` if civicpy cache should be updated. Note this will take serveral" - "minutes. `False` if local cache should be used") + help=( + "`True` if civicpy cache should be updated. Note this will take serveral" + "minutes. `False` if local cache should be used" + ), ) @click.option( "--update_from_remote", @@ -130,43 +145,57 @@ class CLI: is_flag=True, default=False, required=False, - help=("If set to `True`, civicpy.update_cache will first download the remote " - "cache designated by REMOTE_CACHE_URL, store it to LOCAL_CACHE_PATH, " - "and then load the downloaded cache into memory.") + help=( + "If set to `True`, civicpy.update_cache will first download the remote " + "cache designated by REMOTE_CACHE_URL, store it to LOCAL_CACHE_PATH, " + "and then load the downloaded cache into memory." + ), ) @click.option( "--oncokb_variants_by_protein_change_path", "-k", required=False, type=click.Path(exists=True, dir_okay=False, readable=True, path_type=Path), - help=("Path to CSV file containing header row with `hugo_symbol` and " - "`protein_change` and associated rows containing protein variants you " - "wish to harvest using a comma as the delimiter. Not required if using " - "`--load_latest_cdms`, `--load_target_cdm`, or `--load_latest_s3_cdms`") + help=( + "Path to CSV file containing header row with `hugo_symbol` and " + "`protein_change` and associated rows containing protein variants you " + "wish to harvest using a comma as the delimiter. Not required if using " + "`--load_latest_cdms`, `--load_target_cdm`, or `--load_latest_s3_cdms`" + ), ) async def update_metakb_db( - db_url: str, db_username: str, db_password: str, - load_normalizers_db: bool, force_load_normalizers_db: bool, - normalizers_db_url: str, load_latest_cdms: bool, - load_target_cdm: Optional[Path], load_latest_s3_cdms: bool, - update_cached: bool, update_from_remote: bool, - oncokb_variants_by_protein_change_path: Optional[Path] - ): + db_url: str, + db_username: str, + db_password: str, + load_normalizers_db: bool, + force_load_normalizers_db: bool, + normalizers_db_url: str, + load_latest_cdms: bool, + load_target_cdm: Optional[Path], + load_latest_s3_cdms: bool, + update_cached: bool, + update_from_remote: bool, + oncokb_variants_by_protein_change_path: Optional[Path], + ) -> None: """Execute data harvest and transformation from resources and upload to graph datastore. """ - if sum([load_latest_cdms, bool(load_target_cdm), - load_latest_s3_cdms]) > 1: - CLI()._help_msg("Error: Can only use one of `--load_latest_cdms`, " - "`--load_target_cdm`, `--load_latest_s3_cdms`.") + if sum([load_latest_cdms, bool(load_target_cdm), load_latest_s3_cdms]) > 1: + CLI()._help_msg( + "Error: Can only use one of `--load_latest_cdms`, " + "`--load_target_cdm`, `--load_latest_s3_cdms`." + ) - db_url = CLI()._check_db_param(db_url, 'URL') - db_username = CLI()._check_db_param(db_username, 'username') - db_password = CLI()._check_db_param(db_password, 'password') + db_url = CLI()._check_db_param(db_url, "URL") + db_username = CLI()._check_db_param(db_username, "username") + db_password = CLI()._check_db_param(db_password, "password") if normalizers_db_url: - for env_var_name in ['GENE_NORM_DB_URL', 'THERAPY_NORM_DB_URL', - 'DISEASE_NORM_DB_URL']: + for env_var_name in [ + "GENE_NORM_DB_URL", + "THERAPY_NORM_DB_URL", + "DISEASE_NORM_DB_URL", + ]: environ[env_var_name] = normalizers_db_url if not any([load_latest_cdms, load_target_cdm, load_latest_s3_cdms]): @@ -175,10 +204,14 @@ async def update_metakb_db( if not oncokb_variants_by_protein_change_path: CLI()._help_msg( - "Error: Must provide `--oncokb_variants_by_protein_change_path`") + "Error: Must provide `--oncokb_variants_by_protein_change_path`" + ) - CLI()._harvest_sources(update_cached, update_from_remote, - oncokb_variants_by_protein_change_path) + CLI()._harvest_sources( + update_cached, + update_from_remote, + oncokb_variants_by_protein_change_path, + ) await CLI()._transform_sources() # Load neo4j database @@ -193,8 +226,7 @@ async def update_metakb_db( if load_latest_s3_cdms: version = CLI()._retrieve_s3_cdms() g.clear() - for src in sorted({v.value for v - in SourceName.__members__.values()}): + for src in sorted({v.value for v in SourceName.__members__.values()}): if version is not None: pattern = f"{src}_cdm_{version}.json" else: @@ -203,18 +235,16 @@ async def update_metakb_db( try: path = sorted(globbed)[-1] except IndexError: - raise FileNotFoundError(f"No valid transform file found " - f"matching pattern: {pattern}") + raise FileNotFoundError( + f"No valid transform file found " f"matching pattern: {pattern}" + ) click.echo(f"\tLoading {src} CDM from path...: {path}") g.load_from_json(path) g.close() end = timer() - echo_info( - f"Successfully loaded neo4j database in {(end-start):.5f} s\n" - ) + echo_info(f"Successfully loaded neo4j database in {(end-start):.5f} s\n") - s3_cdm_pattern = re.compile( - r"cdm/20[23]\d[01]\d[0123]\d/(.*)_cdm_(.*).json.zip") + s3_cdm_pattern = re.compile(r"cdm/20[23]\d[01]\d[0123]\d/(.*)_cdm_(.*).json.zip") def _retrieve_s3_cdms(self) -> str: """Retrieve most recent CDM files from VICC S3 bucket. Expects to find @@ -230,11 +260,9 @@ def _retrieve_s3_cdms(self) -> str: if not s3: raise ResourceLoadException("Unable to initiate AWS S3 Resource") bucket = sorted( - list( - s3.Bucket("vicc-metakb").objects.filter(Prefix="cdm").all() - ), + list(s3.Bucket("vicc-metakb").objects.filter(Prefix="cdm").all()), key=lambda f: f.key, - reverse=True + reverse=True, ) newest_version: Optional[str] = None for file in bucket: @@ -257,8 +285,10 @@ def _retrieve_s3_cdms(self) -> str: cdm_zip.extract(f"{source}_cdm_{newest_version}.json", cdm_dir) if newest_version is None: - raise FileNotFoundError("Unable to locate files matching expected " - "resource pattern in VICC s3 bucket") + raise FileNotFoundError( + "Unable to locate files matching expected " + "resource pattern in VICC s3 bucket" + ) echo_info(f"Retrieved CDM files dated {newest_version}") return newest_version @@ -272,7 +302,7 @@ def _harvest_sources( harvester_sources = { SourceName.CIVIC.value: CIViCHarvester, SourceName.MOA.value: MOAHarvester, - SourceName.ONCOKB.value: OncoKBHarvester + SourceName.ONCOKB.value: OncoKBHarvester, } total_start = timer() for source_str, source_class in harvester_sources.items(): @@ -297,10 +327,9 @@ def _harvest_sources( end = timer() if not source_successful: - echo_info(f'{source_str} harvest failed.') + echo_info(f"{source_str} harvest failed.") click.get_current_context().exit() - echo_info( - f"{source_str} harvest finished in {(end - start):.5f} s") + echo_info(f"{source_str} harvest finished in {(end - start):.5f} s") total_end = timer() echo_info( f"Successfully harvested all sources in " @@ -315,7 +344,7 @@ async def _transform_sources() -> None: transform_sources = { SourceName.CIVIC.value: CIViCTransform, SourceName.MOA.value: MOATransform, - SourceName.ONCOKB.value: OncoKBTransform + SourceName.ONCOKB.value: OncoKBTransform, } total_start = timer() for src_str, src_name in transform_sources.items(): @@ -324,8 +353,7 @@ async def _transform_sources() -> None: source: Transform = src_name() await source.transform() end = timer() - echo_info( - f"{src_str} transform finished in {(end - start):.5f} s.") + echo_info(f"{src_str} transform finished in {(end - start):.5f} s.") source.create_json() total_end = timer() echo_info( @@ -343,18 +371,21 @@ def _load_normalizers_db(self, load_normalizer_db): load_disease = load_therapy = load_gene = True else: load_disease = self._check_normalizer( - DiseaseDatabase(), {src.value for src in DiseaseSources}) + DiseaseDatabase(), {src.value for src in DiseaseSources} + ) load_therapy = self._check_normalizer( - TherapyDatabase(), {src for src in TherapySources}) + TherapyDatabase(), {src for src in TherapySources} + ) load_gene = self._check_normalizer( - GeneDatabase(), {src.value for src in GeneSources}) + GeneDatabase(), {src.value for src in GeneSources} + ) for load_source, normalizer_cli in [ - (load_disease, DiseaseCLI), (load_therapy, TherapyCLI), - (load_gene, GeneCLI) + (load_disease, DiseaseCLI), + (load_therapy, TherapyCLI), + (load_gene, GeneCLI), ]: - name = \ - str(normalizer_cli).split()[1].split('.')[0][1:].capitalize() + name = str(normalizer_cli).split()[1].split(".")[0][1:].capitalize() self._update_normalizer_db(name, load_source, normalizer_cli) echo_info("Normalizers database loaded.\n") @@ -367,10 +398,8 @@ def _check_normalizer(db, sources) -> bool: :return: `True` If normalizer needs to be loaded. `False` otherwise. """ for src in sources: - response = db.metadata.get_item( - Key={'src_name': src} - ) - if not response.get('Item'): + response = db.metadata.get_item(Key={"src_name": src}) + if not response.get("Item"): return True return False @@ -385,15 +414,14 @@ def _update_normalizer_db(name, load_normalizer, source_cli) -> None: """ if load_normalizer: try: - echo_info(f'\nLoading {name} Normalizer data...') - source_cli.update_normalizer_db( - ['--update_all', '--update_merged']) - echo_info(f'Successfully Loaded {name} Normalizer data.\n') + echo_info(f"\nLoading {name} Normalizer data...") + source_cli.update_normalizer_db(["--update_all", "--update_merged"]) + echo_info(f"Successfully Loaded {name} Normalizer data.\n") except SystemExit as e: if e.code != 0: raise e else: - echo_info(f'{name} Normalizer is already loaded.\n') + echo_info(f"{name} Normalizer is already loaded.\n") @staticmethod def _check_db_param(param: str, name: str) -> str: @@ -403,17 +431,17 @@ def _check_db_param(param: str, name: str) -> str: :return: parameter value, or exit with error message if unavailable """ if not param: - env_var_name = f'METAKB_DB_{name.upper()}' + env_var_name = f"METAKB_DB_{name.upper()}" if env_var_name in environ.keys(): return environ[env_var_name] else: # Default is local - if name == 'URL': + if name == "URL": return "bolt://localhost:7687" - elif name == 'username': - return 'neo4j' + elif name == "username": + return "neo4j" else: - return 'admin' + return "admin" else: return param @@ -431,5 +459,5 @@ def _help_msg(msg: str = ""): ctx.exit() -if __name__ == '__main__': +if __name__ == "__main__": CLI().update_metakb_db(_anyio_backend="asyncio") diff --git a/metakb/database.py b/metakb/database.py index c89664f3..5801b37d 100644 --- a/metakb/database.py +++ b/metakb/database.py @@ -1,17 +1,18 @@ """Graph database for storing harvested data.""" -from neo4j import GraphDatabase -from neo4j.exceptions import ServiceUnavailable, ConstraintError -from typing import List, Tuple, Dict, Set -import logging +import ast +import base64 import json -from pathlib import Path +import logging from os import environ +from pathlib import Path +from typing import Dict, List, Set, Tuple + import boto3 -import base64 from botocore.exceptions import ClientError -import ast +from neo4j import GraphDatabase +from neo4j.exceptions import ConstraintError, ServiceUnavailable -logger = logging.getLogger('metakb.database') +logger = logging.getLogger("metakb.database") logger.setLevel(logging.DEBUG) @@ -21,29 +22,31 @@ def _create_keys_string(entity, keys) -> str: :param Tuple keys: key names to check :return: formatted String for use in Cypher query """ - nonnull_keys = [f"{key}:${key}" - for key in keys if entity.get(key)] - keys_string = ', '.join(nonnull_keys) + nonnull_keys = [f"{key}:${key}" for key in keys if entity.get(key)] + keys_string = ", ".join(nonnull_keys) return keys_string class Graph: """Manage requests to graph datastore.""" - def __init__(self, uri: str = '', credentials: Tuple[str, str] = ('', '')): + def __init__(self, uri: str = "", credentials: Tuple[str, str] = ("", "")): """Initialize Graph driver instance. :param str uri: address of Neo4j DB :param Tuple[str, str] credentials: tuple containing username and password """ - if 'METAKB_NORM_EB_PROD' in environ: + if "METAKB_NORM_EB_PROD" in environ: secret = ast.literal_eval(self.get_secret()) uri = f"bolt://{secret['host']}:{secret['port']}" - credentials = (secret['username'], secret['password']) - elif 'METAKB_DB_URL' in environ and 'METAKB_DB_USERNAME' in environ and 'METAKB_DB_PASSWORD' in environ: # noqa: E501 - uri = environ['METAKB_DB_URL'] - credentials = (environ['METAKB_DB_USERNAME'], - environ['METAKB_DB_PASSWORD']) + credentials = (secret["username"], secret["password"]) + elif ( + "METAKB_DB_URL" in environ + and "METAKB_DB_USERNAME" in environ + and "METAKB_DB_PASSWORD" in environ + ): + uri = environ["METAKB_DB_URL"] + credentials = (environ["METAKB_DB_USERNAME"], environ["METAKB_DB_PASSWORD"]) elif not (uri and credentials[0] and credentials[1]): # Local uri = "bolt://localhost:7687" @@ -58,8 +61,10 @@ def close(self): def clear(self): """Debugging helper - wipe out DB.""" + def delete_all(tx): tx.run("MATCH (n) DETACH DELETE n;") + with self.driver.session() as session: session.write_transaction(delete_all) @@ -71,7 +76,7 @@ def load_from_json(self, src_transformed_cdm: Path): methods, and documents """ logger.info(f"Loading data from {src_transformed_cdm}") - with open(src_transformed_cdm, 'r') as f: + with open(src_transformed_cdm, "r") as f: items = json.load(f) self.add_transformed_data(items) @@ -79,10 +84,19 @@ def load_from_json(self, src_transformed_cdm: Path): def _create_constraints(tx): """Create unique property constraints for ID values""" for label in [ - 'Gene', 'Disease', 'Therapy', 'Variation', 'GeneDescriptor', - 'TherapyDescriptor', 'DiseaseDescriptor', - 'VariationDescriptor', 'VariationGroup', 'Proposition', - 'Document', 'Statement', 'Method' + "Gene", + "Disease", + "Therapy", + "Variation", + "GeneDescriptor", + "TherapyDescriptor", + "DiseaseDescriptor", + "VariationDescriptor", + "VariationGroup", + "Proposition", + "Document", + "Statement", + "Method", ]: query = ( f"CREATE CONSTRAINT {label.lower()}_id_constraint " @@ -92,8 +106,9 @@ def _create_constraints(tx): try: tx.run(query) except ServiceUnavailable as exception: - logging.error(f"Failed to generate ID property " - f"constraint for {label}.") + logging.error( + f"Failed to generate ID property " f"constraint for {label}." + ) raise exception def add_transformed_data(self, data: Dict): @@ -105,55 +120,54 @@ def add_transformed_data(self, data: Dict): added_ids = set() # Used to keep track of IDs that are in statements with self.driver.session() as session: loaded_count = 0 - for ev in data.get('statements', []): + for ev in data.get("statements", []): self._get_ids_from_statement(ev, added_ids) - for var_descr in data.get('variation_descriptors', []): - if var_descr['id'] in added_ids: - gc = var_descr['gene_context'] + for var_descr in data.get("variation_descriptors", []): + if var_descr["id"] in added_ids: + gc = var_descr["gene_context"] if gc: added_ids.add(gc) - for method in data.get('methods', []): + for method in data.get("methods", []): try: - session.write_transaction(self._add_method, method, - added_ids) + session.write_transaction(self._add_method, method, added_ids) except ConstraintError: logger.warning(f"{method['id']} exists already.") continue - for descriptor in ['therapy_descriptors', 'disease_descriptors', - 'gene_descriptors']: + for descriptor in [ + "therapy_descriptors", + "disease_descriptors", + "gene_descriptors", + ]: for d in data.get(descriptor, []): try: - session.write_transaction( - self._add_descriptor, d, added_ids - ) + session.write_transaction(self._add_descriptor, d, added_ids) except ConstraintError: logger.warning(f"{d['id']} exists already.") continue - for var_descr in data.get('variation_descriptors', []): + for var_descr in data.get("variation_descriptors", []): try: - session.write_transaction(self._add_variation_descriptor, - var_descr, added_ids) + session.write_transaction( + self._add_variation_descriptor, var_descr, added_ids + ) except ConstraintError: logger.warning(f"{var_descr['id']} exists already.") continue - for doc in data.get('documents'): + for doc in data.get("documents"): try: session.write_transaction(self._add_document, doc) except ConstraintError: logger.warning(f"{doc['id']} exists already.") continue - for proposition in data.get('propositions', []): + for proposition in data.get("propositions", []): try: - session.write_transaction(self._add_proposition, - proposition) + session.write_transaction(self._add_proposition, proposition) except ConstraintError: logger.warning(f"{proposition['id']} exists already.") continue - for s in data.get('statements', []): + for s in data.get("statements", []): loaded_count += 1 try: - session.write_transaction(self._add_statement, s, - added_ids) + session.write_transaction(self._add_statement, s, added_ids) except ConstraintError: logger.warning(f"{s['id']} exists already.") logger.info(f"Successfully loaded {loaded_count} statements.") @@ -165,22 +179,25 @@ def _add_method(tx, method: Dict, added_ids: Set[str]): `version`, and `authors` values. :param set added_ids: IDs found in statements """ - method['version'] = json.dumps(method['version']) + method["version"] = json.dumps(method["version"]) query = """ MERGE (n:Method {id:$id, label:$label, url:$url, version:$version, authors: $authors}); """ - if method['id'] in added_ids: + if method["id"] in added_ids: try: tx.run(query, **method) except ServiceUnavailable as exception: - logging.error(f"Failed to add Method object\nQuery: " - f"{query}\nAssertionMethod: {method}") + logging.error( + f"Failed to add Method object\nQuery: " + f"{query}\nAssertionMethod: {method}" + ) raise exception @staticmethod - def _update_fmt_key_extensions(fmt_keys: str, extensions: List[Dict], - obj: Dict) -> str: + def _update_fmt_key_extensions( + fmt_keys: str, extensions: List[Dict], obj: Dict + ) -> str: """Return an updated formatted string containing extensions data. Will mutate `obj` with extensions names + values. @@ -203,43 +220,45 @@ def _add_descriptor(self, tx, descriptor: Dict, added_ids: Set[str]): {'TherapyDescriptor', 'DiseaseDescriptor', 'GeneDescriptor'} :param set added_ids: IDs found in statements """ - if descriptor['id'] not in added_ids: + if descriptor["id"] not in added_ids: return - descr_type = descriptor['type'] - if descr_type == 'TherapyDescriptor': - value_type = 'Therapy' - elif descr_type == 'DiseaseDescriptor': - value_type = 'Disease' - elif descr_type == 'GeneDescriptor': - value_type = 'Gene' + descr_type = descriptor["type"] + if descr_type == "TherapyDescriptor": + value_type = "Therapy" + elif descr_type == "DiseaseDescriptor": + value_type = "Disease" + elif descr_type == "GeneDescriptor": + value_type = "Gene" else: raise TypeError(f"Invalid Descriptor type: {descr_type}") value_id = f"{value_type.lower()}_id" - descr_keys = _create_keys_string(descriptor, ('id', 'label', - 'description', 'xrefs', - 'alternate_labels')) + descr_keys = _create_keys_string( + descriptor, ("id", "label", "description", "xrefs", "alternate_labels") + ) - extensions = descriptor.get('extensions', []) + extensions = descriptor.get("extensions", []) if descr_type in {"TherapyDescriptor", "GeneDescriptor", "DiseaseDescriptor"}: - descr_keys = self._update_fmt_key_extensions(descr_keys, extensions, - descriptor) + descr_keys = self._update_fmt_key_extensions( + descr_keys, extensions, descriptor + ) - query = f''' + query = f""" MERGE (descr:{descr_type} {{ {descr_keys} }}) MERGE (value:{value_type} {{ id:${value_id} }}) MERGE (descr) -[:DESCRIBES]-> (value) - ''' + """ try: tx.run(query, **descriptor) except ServiceUnavailable as exception: - logging.error(f"Failed to add Descriptor object\nQuery: {query}\n" - f"Descriptor: {descriptor}") + logging.error( + f"Failed to add Descriptor object\nQuery: {query}\n" + f"Descriptor: {descriptor}" + ) raise exception @staticmethod - def _add_variation_descriptor(tx, descriptor_in: Dict, - added_ids: Set[str]): + def _add_variation_descriptor(tx, descriptor_in: Dict, added_ids: Set[str]): """Add variant descriptor object to DB. :param Dict descriptor_in: must include a `value_id` field and a `value` object containing `type`, `state`, and `location` objects. @@ -247,46 +266,55 @@ def _add_variation_descriptor(tx, descriptor_in: Dict, """ descriptor = descriptor_in.copy() - if descriptor['id'] not in added_ids: + if descriptor["id"] not in added_ids: return # prepare value properties - variation_type = descriptor['variation']['type'] - descriptor['variation'] = json.dumps(descriptor['variation']) + variation_type = descriptor["variation"]["type"] + descriptor["variation"] = json.dumps(descriptor["variation"]) # prepare descriptor properties - expressions = descriptor.get('expressions') + expressions = descriptor.get("expressions") if expressions: for expression in expressions: - syntax = expression['syntax'].split('.')[1] + syntax = expression["syntax"].split(".")[1] key = f"expressions_{syntax}" if key in descriptor: - descriptor[key].append(expression['value']) + descriptor[key].append(expression["value"]) else: - descriptor[key] = [expression['value']] - - nonnull_keys = [_create_keys_string(descriptor, - ('id', 'label', 'description', - 'xrefs', 'alternate_labels', - 'structural_type', - 'molecule_context', - 'expressions_c', - 'expressions_g', - 'expressions_p', - 'vrs_ref_allele_seq'))] + descriptor[key] = [expression["value"]] + + nonnull_keys = [ + _create_keys_string( + descriptor, + ( + "id", + "label", + "description", + "xrefs", + "alternate_labels", + "structural_type", + "molecule_context", + "expressions_c", + "expressions_g", + "expressions_p", + "vrs_ref_allele_seq", + ), + ) + ] # handle extensions variant_groups = None - extensions = descriptor.get('extensions') + extensions = descriptor.get("extensions") if extensions: for ext in extensions: - name = ext['name'] - if name == 'variant_group': - variant_groups = ext['value'] + name = ext["name"] + if name == "variant_group": + variant_groups = ext["value"] else: - descriptor[name] = json.dumps(ext['value']) + descriptor[name] = json.dumps(ext["value"]) nonnull_keys.append(f"{name}:${name}") - descriptor_keys = ', '.join(nonnull_keys) + descriptor_keys = ", ".join(nonnull_keys) query = f""" MERGE (descr:VariationDescriptor @@ -302,15 +330,17 @@ def _add_variation_descriptor(tx, descriptor_in: Dict, try: tx.run(query, **descriptor) except ServiceUnavailable as exception: - logging.error(f"Failed to add Variant Descriptor object\nQuery: " - f"{query}\nDescriptor: {descriptor}") + logging.error( + f"Failed to add Variant Descriptor object\nQuery: " + f"{query}\nDescriptor: {descriptor}" + ) raise exception if variant_groups: for grp in variant_groups: params = descriptor.copy() - params['group_id'] = grp['id'] - params['group_label'] = grp['label'] - params['group_description'] = grp.get('description', '') + params["group_id"] = grp["id"] + params["group_label"] = grp["label"] + params["group_description"] = grp.get("description", "") query = f""" MERGE (grp:VariationGroup {{id:$group_id, label:$group_label, @@ -321,8 +351,10 @@ def _add_variation_descriptor(tx, descriptor_in: Dict, try: tx.run(query, **params) except ServiceUnavailable as exception: - logging.error(f"Failed to add Variant Descriptor object\n" - f"Query: {query}\nDescriptor: {descriptor}") + logging.error( + f"Failed to add Variant Descriptor object\n" + f"Query: {query}\nDescriptor: {descriptor}" + ) raise exception @staticmethod @@ -332,9 +364,8 @@ def _add_proposition(tx, proposition: Dict): :param Dict proposition: must include `disease_context`, `therapy`, and `has_originating_context` fields. """ - formatted_keys = _create_keys_string(proposition, ('id', 'predicate', - 'type')) - prop_type = proposition.get('type') + formatted_keys = _create_keys_string(proposition, ("id", "predicate", "type")) + prop_type = proposition.get("type") if prop_type == "therapeutic_response_proposition": prop_label = ":TherapeuticResponse" therapy_obj = "MERGE (therapy:Therapy {id:$object})" @@ -365,8 +396,10 @@ def _add_proposition(tx, proposition: Dict): try: tx.run(query, **proposition) except ServiceUnavailable as exception: - logging.error(f"Failed to add Proposition object\n" - f"Query: {query}\nProposition: {proposition}") + logging.error( + f"Failed to add Proposition object\n" + f"Query: {query}\nProposition: {proposition}" + ) raise exception @staticmethod @@ -378,26 +411,28 @@ def _add_document(tx, document: Dict): query = "MATCH (n:Document {id:$id}) RETURN n" result = tx.run(query, **document) except ServiceUnavailable as exception: - logging.error(f"Failed to read Document object\n" - f"Query: {query}\nDocument: " - f"{document}") + logging.error( + f"Failed to read Document object\n" + f"Query: {query}\nDocument: " + f"{document}" + ) raise exception if not result.single(): - formatted_keys = _create_keys_string(document, - ('id', 'label', - 'document_id', - 'xrefs', - 'description')) + formatted_keys = _create_keys_string( + document, ("id", "label", "document_id", "xrefs", "description") + ) query = f""" MERGE (n:Document {{ {formatted_keys} }}); """ try: tx.run(query, **document) except ServiceUnavailable as exception: - logging.error(f"Failed to add Document object\n" - f"Query: {query}\nDocument: " - f"{document}") + logging.error( + f"Failed to add Document object\n" + f"Query: {query}\nDocument: " + f"{document}" + ) raise exception def _get_ids_from_statement(self, statement, added_ids): @@ -406,10 +441,12 @@ def _get_ids_from_statement(self, statement, added_ids): :param Node statement: Statement node :param set added_ids: IDs found in statements """ - for node_id in [statement.get('therapy_descriptor'), - statement.get('variation_descriptor'), - statement.get('disease_descriptor'), - statement.get('method')]: + for node_id in [ + statement.get("therapy_descriptor"), + statement.get("variation_descriptor"), + statement.get("disease_descriptor"), + statement.get("method"), + ]: if node_id: added_ids.add(node_id) @@ -420,18 +457,19 @@ def _add_statement(self, tx, statement: Dict, added_ids: Set[str]): as optional `therapy_descriptor` field :param set added_ids: IDs found in statements """ - formatted_keys = _create_keys_string(statement, ('id', 'description', - 'direction', - 'variation_origin', - 'evidence_level')) + formatted_keys = _create_keys_string( + statement, + ("id", "description", "direction", "variation_origin", "evidence_level"), + ) extensions = statement.get("extensions", []) - formatted_keys = self._update_fmt_key_extensions(formatted_keys, extensions, - statement) + formatted_keys = self._update_fmt_key_extensions( + formatted_keys, extensions, statement + ) match_line = "" rel_line = "" - supported_by = statement.get('supported_by', []) + supported_by = statement.get("supported_by", []) if supported_by: for i, ev in enumerate(supported_by): name = f"doc_{i}" @@ -439,10 +477,11 @@ def _add_statement(self, tx, statement: Dict, added_ids: Set[str]): match_line += f"MERGE ({name} {{ id:${name} }})\n" rel_line += f"MERGE (ev) -[:CITES]-> ({name})\n" - td = statement.get('therapy_descriptor') + td = statement.get("therapy_descriptor") if td: - therapy_descriptor = \ - f"MERGE (ther:TherapyDescriptor {{id:$therapy_descriptor}})" # noqa: F541, E501 + therapy_descriptor = ( + f"MERGE (ther:TherapyDescriptor {{id:$therapy_descriptor}})" # noqa: F541 + ) therapy_obj = f"MERGE (ev) -[:HAS_THERAPY]-> (ther)" # noqa: F541 added_ids.add(td) else: @@ -467,55 +506,52 @@ def _add_statement(self, tx, statement: Dict, added_ids: Set[str]): try: tx.run(query, **statement) except ServiceUnavailable as exception: - logging.error(f"Failed to add Evidence object\n" - f"Query: {query}\nEvidence: {statement}") + logging.error( + f"Failed to add Evidence object\n" + f"Query: {query}\nEvidence: {statement}" + ) raise exception @staticmethod def get_secret(): """Get secrets for MetaKB instances.""" - secret_name = environ['METAKB_DB_SECRET'] + secret_name = environ["METAKB_DB_SECRET"] region_name = "us-east-2" # Create a Secrets Manager client session = boto3.session.Session() - client = session.client( - service_name='secretsmanager', - region_name=region_name - ) + client = session.client(service_name="secretsmanager", region_name=region_name) try: - get_secret_value_response = client.get_secret_value( - SecretId=secret_name - ) + get_secret_value_response = client.get_secret_value(SecretId=secret_name) except ClientError as e: logger.warning(e) - if e.response['Error']['Code'] == 'DecryptionFailureException': + if e.response["Error"]["Code"] == "DecryptionFailureException": # Secrets Manager can't decrypt the protected # secret text using the provided KMS key. raise e - elif e.response['Error']['Code'] == \ - 'InternalServiceErrorException': + elif e.response["Error"]["Code"] == "InternalServiceErrorException": # An error occurred on the server side. raise e - elif e.response['Error']['Code'] == 'InvalidParameterException': + elif e.response["Error"]["Code"] == "InvalidParameterException": # You provided an invalid value for a parameter. raise e - elif e.response['Error']['Code'] == 'InvalidRequestException': + elif e.response["Error"]["Code"] == "InvalidRequestException": # You provided a parameter value that is not valid for # the current state of the resource. raise e - elif e.response['Error']['Code'] == 'ResourceNotFoundException': + elif e.response["Error"]["Code"] == "ResourceNotFoundException": # We can't find the resource that you asked for. raise e else: # Decrypts secret using the associated KMS CMK. # Depending on whether the secret is a string or binary, # one of these fields will be populated. - if 'SecretString' in get_secret_value_response: - secret = get_secret_value_response['SecretString'] + if "SecretString" in get_secret_value_response: + secret = get_secret_value_response["SecretString"] return secret else: decoded_binary_secret = base64.b64decode( - get_secret_value_response['SecretBinary']) + get_secret_value_response["SecretBinary"] + ) return decoded_binary_secret diff --git a/metakb/delta.py b/metakb/delta.py index 6e32ede0..e69c501b 100644 --- a/metakb/delta.py +++ b/metakb/delta.py @@ -3,21 +3,20 @@ import logging from datetime import date -from jsondiff import diff from civicpy.__version__ import __version__ as civicpy_version +from jsondiff import diff from metakb import APP_ROOT +from metakb.harvesters import CIViCHarvester, MOAHarvester, OncoKBHarvester from metakb.schemas import SourceName from metakb.version import __version__ -from metakb.harvesters import CIViCHarvester, MOAHarvester, OncoKBHarvester - HARVESTER_CLASS = { SourceName.CIVIC.value: CIViCHarvester, SourceName.MOA.value: MOAHarvester, - SourceName.ONCOKB.value: OncoKBHarvester + SourceName.ONCOKB.value: OncoKBHarvester, } -logger = logging.getLogger('metakb.delta') +logger = logging.getLogger("metakb.delta") logger.setLevel(logging.DEBUG) @@ -33,9 +32,9 @@ def __init__(self, main_json, src, *args, **kwargs): self._src = src.lower() assert self._src in HARVESTER_CLASS.keys() self._main_json = main_json - if '_updated_json' in kwargs: + if "_updated_json" in kwargs: # The path to the updated harvester composite json file. - self._updated_json = kwargs['_updated_json'] + self._updated_json = kwargs["_updated_json"] else: self._updated_json = None @@ -46,57 +45,49 @@ def compute_delta(self): harvester. """ # Main harvester - with open(self._main_json, 'r') as f: + with open(self._main_json, "r") as f: main_json = json.load(f) - current_date = date.today().strftime('%Y%m%d') + current_date = date.today().strftime("%Y%m%d") # updated harvester if self._updated_json: # Updated harvester file already exists - with open(self._updated_json, 'r') as f: + with open(self._updated_json, "r") as f: updated_json = json.load(f) else: # Want to create updated harvester file fn = f"{self._src}_harvester_{current_date}.json" HARVESTER_CLASS[self._src]().harvest(filename=fn) - with open(f"{APP_ROOT}/data/{self._src}/harvester/{fn}", 'r') as f: + with open(f"{APP_ROOT}/data/{self._src}/harvester/{fn}", "r") as f: updated_json = json.load(f) delta = { - '_meta': { - 'metakb_version': __version__, - 'date_harvested': current_date - } + "_meta": {"metakb_version": __version__, "date_harvested": current_date} } if self._src == SourceName.CIVIC: - delta['_meta']['civicpy_version'] = civicpy_version + delta["_meta"]["civicpy_version"] = civicpy_version elif self._src == SourceName.MOA: - delta['_meta']['moa_api_version'] = '0.2' + delta["_meta"]["moa_api_version"] = "0.2" elif self._src == SourceName.ONCOKB: delta["_meta"]["oncokb_app_version"] = updated_json["appVersion"]["version"] delta["_meta"]["oncokb_api_version"] = updated_json["apiVersion"]["version"] - delta["_meta"]["oncokb_data_version"] = \ - updated_json["dataVersion"]["version"] + delta["_meta"]["oncokb_data_version"] = updated_json["dataVersion"][ + "version" + ] for record_type in main_json.keys(): - delta[record_type] = { - 'DELETE': [], - 'INSERT': [], - 'UPDATE': [] - } + delta[record_type] = {"DELETE": [], "INSERT": [], "UPDATE": []} updated = updated_json[record_type] main = main_json[record_type] updated_ids = self._get_ids(updated) main_ids = self._get_ids(main) additional_ids = list(set(updated_ids) - set(main_ids)) - self._ins_del_delta(delta, record_type, 'INSERT', additional_ids, - updated) + self._ins_del_delta(delta, record_type, "INSERT", additional_ids, updated) remove_ids = list(set(main_ids) - set(updated_ids)) - self._ins_del_delta(delta, record_type, 'DELETE', remove_ids, - main) + self._ins_del_delta(delta, record_type, "DELETE", remove_ids, main) self._update_delta(delta, record_type, updated, main) @@ -113,7 +104,7 @@ def _ins_del_delta(self, delta, record_type, key, ids_list, data): :param dict data: Harvester data """ for record in data: - if record['id'] in ids_list: + if record["id"] in ids_list: delta[record_type][key].append(record) def _update_delta(self, delta, record_type, updated, main): @@ -126,12 +117,15 @@ def _update_delta(self, delta, record_type, updated, main): """ for updated_record in updated: for main_record in main: - if main_record['id'] == updated_record['id']: + if main_record["id"] == updated_record["id"]: if updated_record != main_record: - delta[record_type]['UPDATE'].append({ - str(main_record['id']): - diff(main_record, updated_record, marshal=True) - }) + delta[record_type]["UPDATE"].append( + { + str(main_record["id"]): diff( + main_record, updated_record, marshal=True + ) + } + ) break def _get_ids(self, records): @@ -142,7 +136,7 @@ def _get_ids(self, records): """ ids = list() for r in records: - r_id = r['id'] + r_id = r["id"] if r_id not in ids: ids.append(r_id) return ids @@ -153,9 +147,8 @@ def _create_json(self, delta, current_date): :param dict delta: A dictionary containing deltas. :param str current_date: The current date """ - src_dir = APP_ROOT / 'data' / self._src / 'delta' + src_dir = APP_ROOT / "data" / self._src / "delta" src_dir.mkdir(exist_ok=True, parents=True) - with open(f"{src_dir}/{self._src}_deltas_{current_date}.json", - 'w+') as f: + with open(f"{src_dir}/{self._src}_deltas_{current_date}.json", "w+") as f: json.dump(delta, f, indent=4) diff --git a/metakb/exceptions.py b/metakb/exceptions.py index d04a5d1a..fe8b4a1c 100644 --- a/metakb/exceptions.py +++ b/metakb/exceptions.py @@ -1,7 +1,7 @@ """Define exceptions.""" -class NormalizationException(Exception): +class NormalizationException(Exception): # noqa: N818 """Indicate failure to normalize term.""" pass diff --git a/metakb/harvesters/__init__.py b/metakb/harvesters/__init__.py index 4745bbc9..a7f790b3 100644 --- a/metakb/harvesters/__init__.py +++ b/metakb/harvesters/__init__.py @@ -1,5 +1 @@ """A package for metakb harvester routines.""" -from .base import Harvester -from .civic import CIViCHarvester -from .moa import MOAHarvester -from .oncokb import OncoKBHarvester diff --git a/metakb/harvesters/base.py b/metakb/harvesters/base.py index 8ffae15a..66ef8071 100644 --- a/metakb/harvesters/base.py +++ b/metakb/harvesters/base.py @@ -1,8 +1,8 @@ """A module for the Harvester base class""" -from typing import List, Dict, Optional import json import logging from datetime import datetime as dt +from typing import Dict, List, Optional from metakb import APP_ROOT, DATE_FMT @@ -14,8 +14,7 @@ class Harvester: """A base class for content harvesters.""" def harvest(self) -> bool: - """ - Retrieve and store records from a resource. Records may be stored in + """Retrieve and store records from a resource. Records may be stored in any manner, but must be retrievable by :method:`iterate_records`. :return: `True` if operation was successful, `False` otherwise. @@ -23,8 +22,9 @@ def harvest(self) -> bool: """ raise NotImplementedError - def create_json(self, items: Dict[str, List], - filename: Optional[str] = None) -> bool: + def create_json( + self, items: Dict[str, List], filename: Optional[str] = None + ) -> bool: """Create composite and individual JSON for harvested data. :param Dict items: item types keyed to Lists of values diff --git a/metakb/harvesters/civic.py b/metakb/harvesters/civic.py index 7e571e72..c3fe8c16 100644 --- a/metakb/harvesters/civic.py +++ b/metakb/harvesters/civic.py @@ -1,11 +1,12 @@ """A module for the CIViC harvester.""" import logging -from typing import Dict, List, Optional from pathlib import Path +from typing import Dict, List, Optional -from civicpy import civic as civicpy, LOCAL_CACHE_PATH +from civicpy import LOCAL_CACHE_PATH +from civicpy import civic as civicpy -from metakb.harvesters.base import Harvester # noqa: I202 +from metakb.harvesters.base import Harvester logger = logging.getLogger(__name__) @@ -17,7 +18,7 @@ def __init__( self, update_cache: bool = False, update_from_remote: bool = True, - local_cache_path: Optional[Path] = LOCAL_CACHE_PATH + local_cache_path: Optional[Path] = LOCAL_CACHE_PATH, ) -> None: """Initialize CIViCHarvester class. @@ -62,9 +63,9 @@ def harvest(self, filename: Optional[str] = None) -> bool: "genes": self.genes, "variants": self.variants, "molecular_profiles": self.molecular_profiles, - "assertions": self.assertions + "assertions": self.assertions, }, - filename + filename, ) if not json_created: logger.error( @@ -130,7 +131,8 @@ def _dictify(self, obj: any) -> Dict: if isinstance(obj, civicpy.CivicRecord): return { k: self._dictify(v) - for k, v in obj.__dict__.items() if not k.startswith(("_", "partial")) + for k, v in obj.__dict__.items() + if not k.startswith(("_", "partial")) } if isinstance(obj, list): diff --git a/metakb/harvesters/moa.py b/metakb/harvesters/moa.py index 816f26be..a2133838 100644 --- a/metakb/harvesters/moa.py +++ b/metakb/harvesters/moa.py @@ -1,12 +1,11 @@ """A module for the Molecular Oncology Almanac harvester""" import logging -from typing import Optional, List, Dict +from typing import Dict, List, Optional import requests import requests_cache -from metakb.harvesters.base import Harvester # noqa: I202 - +from metakb.harvesters.base import Harvester logger = logging.getLogger("metakb.harvesters.moa") logger.setLevel(logging.DEBUG) @@ -16,8 +15,7 @@ class MOAHarvester(Harvester): """A class for the Molecular Oncology Almanac harvester.""" def harvest(self, filename: Optional[str] = None) -> bool: - """ - Retrieve and store sources, variants, and assertions + """Retrieve and store sources, variants, and assertions records from MOAlmanac in composite and individual JSON files. :param Optional[str] filename: File name for composite json @@ -35,14 +33,14 @@ def harvest(self, filename: Optional[str] = None) -> bool: "assertions": assertions, "sources": sources, "variants": variants, - "genes": genes + "genes": genes, }, - filename + filename, ) if not json_created: logger.error("MOAlmanac Harvester was not successful.") return False - except Exception as e: # noqa: E722 + except Exception as e: logger.error(f"MOAlmanac Harvester was not successful: {e}") return False else: @@ -63,8 +61,7 @@ def _harvest_genes() -> List[Dict]: return genes def _harvest_sources(self, assertion_resp: List[Dict]) -> List[Dict]: - """ - Harvest all MOA sources + """Harvest all MOA sources :param List[Dict] assertion_resp: A list of MOA assertion records :return: A list of sources @@ -81,8 +78,7 @@ def _harvest_sources(self, assertion_resp: List[Dict]) -> List[Dict]: return sources def harvest_variants(self) -> List[Dict]: - """ - Harvest all MOA variants + """Harvest all MOA variants :return: A list of variants :rtype: list @@ -96,10 +92,10 @@ def harvest_variants(self) -> List[Dict]: return variants, variants_list - def harvest_assertions(self, assertion_resp: List[Dict], - variants_list: List[Dict]) -> List[Dict]: - """ - Harvest all MOA assertions + def harvest_assertions( + self, assertion_resp: List[Dict], variants_list: List[Dict] + ) -> List[Dict]: + """Harvest all MOA assertions :param List[Dict] assertion_resp: A list of MOA assertion records :param List[Dict] variants_list: A list of MOA variant records @@ -114,8 +110,7 @@ def harvest_assertions(self, assertion_resp: List[Dict], return assertions def _get_all_assertions(self) -> List[Dict]: - """ - Return all assertion records. + """Return all assertion records. :return: All moa assertion records """ @@ -126,8 +121,7 @@ def _get_all_assertions(self) -> List[Dict]: return assertions def _get_all_variants(self) -> List[Dict]: - """ - Return all variant records + """Return all variant records :return: All moa variant records """ @@ -138,8 +132,7 @@ def _get_all_variants(self) -> List[Dict]: return variants def _source_item(self, source: Dict) -> Dict: - """ - Harvest an individual MOA source of evidence + """Harvest an individual MOA source of evidence :param Dict source: source record of each assertion record :return: a dictionary containing MOA source of evidence data @@ -152,7 +145,7 @@ def _source_item(self, source: Dict) -> Dict: "nct": source["nct"], "pmid": source["pmid"], "url": source["url"], - "citation": source["citation"] + "citation": source["citation"], } return source_record @@ -163,18 +156,15 @@ def _harvest_variant(self, variant: Dict) -> Dict: :return: A dictionary containing MOA variant data :rtype: dict """ - variant_record = { - "id": variant["feature_id"] - } + variant_record = {"id": variant["feature_id"]} - variant_record.update({k: v for k, v in variant["attributes"][0].items()}) # noqa: E501 + variant_record.update({k: v for k, v in variant["attributes"][0].items()}) variant_record.update(self._get_feature(variant_record)) return variant_record def _harvest_assertion(self, assertion: Dict, variants_list: List[Dict]) -> Dict: - """ - Harvest an individual MOA assertion record + """Harvest an individual MOA assertion record :param Dict assertion: a MOA assertion record :param List[Dict] variants_list: a list of MOA variant records @@ -188,20 +178,20 @@ def _harvest_assertion(self, assertion: Dict, variants_list: List[Dict]) -> Dict "disease": { "name": assertion["disease"], "oncotree_code": assertion["oncotree_code"], - "oncotree_term": assertion["oncotree_term"] + "oncotree_term": assertion["oncotree_term"], }, "therapy_name": assertion["therapy_name"], "therapy_type": assertion["therapy_type"], "clinical_significance": self._get_therapy( - assertion["therapy_resistance"], - assertion["therapy_sensitivity"]), + assertion["therapy_resistance"], assertion["therapy_sensitivity"] + ), "predictive_implication": assertion["predictive_implication"], "favorable_prognosis": assertion["favorable_prognosis"], "created_on": assertion["created_on"], "last_updated": assertion["last_updated"], "submitted_by": assertion["submitted_by"], "validated": assertion["validated"], - "source_ids": assertion["sources"][0]["source_id"] + "source_ids": assertion["sources"][0]["source_id"], } for v in variants_list: @@ -211,8 +201,7 @@ def _harvest_assertion(self, assertion: Dict, variants_list: List[Dict]) -> Dict return assertion_record def _get_therapy(self, resistance: bool, sensitivity: bool) -> Optional[str]: - """ - Get therapy response data. + """Get therapy response data. :param bool resistance: `True` if Therapy Resistance. `False` if not Therapy Resistance @@ -229,8 +218,7 @@ def _get_therapy(self, resistance: bool, sensitivity: bool) -> Optional[str]: return None def _get_feature(self, v: Dict) -> Dict: - """ - Get feature name from the harvested variants + """Get feature name from the harvested variants :param Dict v: harvested MOA variant :return: feature name same format as displayed in moalmanac.org @@ -238,19 +226,21 @@ def _get_feature(self, v: Dict) -> Dict: """ feature_type = v["feature_type"] if feature_type == "rearrangement": - feature = "{}{}{}".format(v["gene1"], - f"--{v['gene2']}" if v["gene2"] else "", - f" {v['rearrangement_type']}" - if v["rearrangement_type"] else "") + feature = "{}{}{}".format( + v["gene1"], + f"--{v['gene2']}" if v["gene2"] else "", + f" {v['rearrangement_type']}" if v["rearrangement_type"] else "", + ) elif feature_type == "somatic_variant": - feature = "{}{}{}".format(v["gene"], - f" {v['protein_change']}" - if v["protein_change"] else "", - f" ({v['variant_annotation']})" - if v["variant_annotation"] else "") + feature = "{}{}{}".format( + v["gene"], + f" {v['protein_change']}" if v["protein_change"] else "", + f" ({v['variant_annotation']})" if v["variant_annotation"] else "", + ) elif feature_type == "germline_variant": - feature = "{}{}".format(v["gene"], " (Pathogenic)" - if v["pathogenic"] == "1.0" else "") + feature = "{}{}".format( + v["gene"], " (Pathogenic)" if v["pathogenic"] == "1.0" else "" + ) elif feature_type == "copy_number": feature = "{} {}".format(v["gene"], v["direction"]) elif feature_type == "microsatellite_stability": @@ -262,15 +252,18 @@ def _get_feature(self, v: Dict) -> Dict: clss = v["classification"] min_mut = v["minimum_mutations"] mut_per_mb = v["mutations_per_mb"] - feature = "{}{}".format(clss, - f" (>= {min_mut} mutations)" if min_mut - else (f" (>= {mut_per_mb} mutations/Mb)" - if mut_per_mb else "")) + feature = "{}{}".format( + clss, + f" (>= {min_mut} mutations)" + if min_mut + else (f" (>= {mut_per_mb} mutations/Mb)" if mut_per_mb else ""), + ) elif feature_type == "neoantigen_burden": feature = "{}".format(v["classification"]) elif feature_type == "knockdown" or feature_type == "silencing": - feature = "{}{}".format(v["gene"], f" ({v['technique']})" - if v["technique"] else "") + feature = "{}{}".format( + v["gene"], f" ({v['technique']})" if v["technique"] else "" + ) else: feature = "{}".format(v["event"]) diff --git a/metakb/harvesters/oncokb.py b/metakb/harvesters/oncokb.py index cf0f647c..5847144a 100644 --- a/metakb/harvesters/oncokb.py +++ b/metakb/harvesters/oncokb.py @@ -1,21 +1,20 @@ """Module for harvesting data from OncoKB""" -import logging import csv -from pathlib import Path -from typing import Dict, List, Union, Optional -from os import environ +import logging from enum import Enum +from os import environ +from pathlib import Path +from typing import Dict, List, Optional, Union import requests from metakb.harvesters.base import Harvester - logger = logging.getLogger("metakb.harvesters.oncokb") logger.setLevel(logging.DEBUG) -class OncoKBHarvesterException(Exception): +class OncoKBHarvesterException(Exception): # noqa: N818 """OncoKB Harvester Exceptions""" pass @@ -47,12 +46,14 @@ def __init__(self, api_token: Optional[str] = None) -> None: if not self.api_token: raise OncoKBHarvesterException( "Access to OncoKB data via REST API requires an api token. You can set " - "it during initialization (e.g., OncoKBHarvester(api_token={API_TOKEN})). " # noqa: E501 + "it during initialization (e.g., OncoKBHarvester(api_token={API_TOKEN})). " "or by setting the `ONCOKB_API_TOKEN` environment variable. For getting" - " an API token, visit https://www.oncokb.org/apiAccess.") + " an API token, visit https://www.oncokb.org/apiAccess." + ) - def harvest(self, variants_by_protein_change_path: Path, - filename: Optional[str] = None) -> bool: + def harvest( + self, variants_by_protein_change_path: Path, filename: Optional[str] = None + ) -> bool: """Retrieve and store gene and variant and its associated evidence from OncoKB in composite and individual JSON files. @@ -70,10 +71,18 @@ def harvest(self, variants_by_protein_change_path: Path, self.genes = self.harvest_genes() self.variants = self.harvest_variants(variants_by_protein_change_path) self.metadata = self.get_metadata() - self.diagnostic_levels = self._get_api_response(f"/levels/{OncoKBLevels.DIAGNOSTIC.value}") # noqa: E501 - self.prognostic_levels = self._get_api_response(f"/levels/{OncoKBLevels.PROGNOSTIC.value}") # noqa: E501 - self.resistance_levels = self._get_api_response(f"/levels/{OncoKBLevels.RESISTANCE.value}") # noqa: E501 - self.sensitive_levels = self._get_api_response(f"/levels/{OncoKBLevels.SENSITIVE.value}") # noqa: E501 + self.diagnostic_levels = self._get_api_response( + f"/levels/{OncoKBLevels.DIAGNOSTIC.value}" + ) + self.prognostic_levels = self._get_api_response( + f"/levels/{OncoKBLevels.PROGNOSTIC.value}" + ) + self.resistance_levels = self._get_api_response( + f"/levels/{OncoKBLevels.RESISTANCE.value}" + ) + self.sensitive_levels = self._get_api_response( + f"/levels/{OncoKBLevels.SENSITIVE.value}" + ) json_created = self.create_json( { @@ -84,16 +93,17 @@ def harvest(self, variants_by_protein_change_path: Path, "prognostic": self.prognostic_levels, "resistance": self.resistance_levels, "sensitive": self.sensitive_levels, - "fda": self.fda_levels + "fda": self.fda_levels, }, - "metadata": self.metadata - }, filename + "metadata": self.metadata, + }, + filename, ) if json_created: harvest_successful = True else: logger.error("OncoKB Harvester was not successful") - except Exception as e: # noqa: E722 + except Exception as e: logger.error(f"OncoKB Harvester was not successful: {e}") return harvest_successful @@ -164,8 +174,10 @@ def _harvest_protein_change_variants( reader = csv.reader(f) next(reader) # skip header for symbol, p_change in reader: - endpoint = f"/annotate/mutations/byProteinChange?hugoSymbol={symbol}&"\ - f"alteration={p_change}&referenceGenome=GRCh38" + endpoint = ( + f"/annotate/mutations/byProteinChange?hugoSymbol={symbol}&" + f"alteration={p_change}&referenceGenome=GRCh38" + ) resp = self._get_api_response(endpoint) if resp: variants.append(resp) diff --git a/metakb/main.py b/metakb/main.py index 8b23187a..376d3ea7 100644 --- a/metakb/main.py +++ b/metakb/main.py @@ -1,16 +1,17 @@ """Main application for FastAPI.""" -from fastapi import FastAPI, Query, Path +from typing import Optional + +from fastapi import FastAPI, Path, Query from fastapi.openapi.utils import get_openapi + from metakb.query import QueryHandler +from metakb.schemas import SearchIDService, SearchService, SearchStatementsService from metakb.version import __version__ -from metakb.schemas import SearchService, SearchIDService, \ - SearchStatementsService -from typing import Optional app = FastAPI( - docs_url='/api/v2', - openapi_url='/api/v2/openapi.json', - swagger_ui_parameters={"tryItOutEnabled": True} + docs_url="/api/v2", + openapi_url="/api/v2/openapi.json", + swagger_ui_parameters={"tryItOutEnabled": True}, ) query = QueryHandler() @@ -23,26 +24,29 @@ def custom_openapi(): title="The VICC Meta-Knowledgebase", version=__version__, description="A search interface for cancer variant interpretations" - " assembled by aggregating and harmonizing across multiple" - " cancer variant interpretation knowledgebases.", - routes=app.routes + " assembled by aggregating and harmonizing across multiple" + " cancer variant interpretation knowledgebases.", + routes=app.routes, ) - openapi_schema['info']['contact'] = { + openapi_schema["info"]["contact"] = { "name": "VICC", "email": "help@cancervariants.org", - "url": "https://cancervariants.org" + "url": "https://cancervariants.org", } app.openapi_schema = openapi_schema return app.openapi_schema app.openapi = custom_openapi -search_summary = ("Given variation, disease, therapy, and/or gene, " - "return associated statements and propositions.") +search_summary = ( + "Given variation, disease, therapy, and/or gene, " + "return associated statements and propositions." +) search_response_description = "A response to a validly-formed query." -search_description = ("Return statements and propositions associated" - " to the queried concepts.") +search_description = ( + "Return statements and propositions associated" " to the queried concepts." +) v_description = "Variation (subject) to search" d_description = "Disease (object qualifier) to search" t_description = "Therapy (object) to search" @@ -51,19 +55,21 @@ def custom_openapi(): detail_description = "Display all descriptors, methods, and documents." -@app.get('/api/v2/search', - summary=search_summary, - response_description=search_response_description, - response_model=SearchService, - description=search_description, - response_model_exclude_none=True) +@app.get( + "/api/v2/search", + summary=search_summary, + response_description=search_response_description, + response_model=SearchService, + description=search_description, + response_model_exclude_none=True, +) async def search( variation: Optional[str] = Query(None, description=v_description), disease: Optional[str] = Query(None, description=d_description), therapy: Optional[str] = Query(None, description=t_description), gene: Optional[str] = Query(None, description=g_description), statement_id: Optional[str] = Query(None, description=s_description), - detail: Optional[bool] = Query(False, description=detail_description) + detail: Optional[bool] = Query(False, description=detail_description), ): """Search endpoint""" resp = await query.search(variation, disease, therapy, gene, statement_id, detail) @@ -72,42 +78,50 @@ async def search( search_statements_summary = ( "Given variation, disease, therapy, and/or gene, return associated " - "nested statements containing propositions and descriptors.") + "nested statements containing propositions and descriptors." +) search_statement_response_descr = "A response to a validly-formed query." -search_statements_descr = ( - "Return nested statements associated to the queried concepts.") +search_statements_descr = "Return nested statements associated to the queried concepts." -@app.get('/api/v2/search/statements', - summary=search_statements_summary, - response_description=search_statement_response_descr, - response_model=SearchStatementsService, - description=search_statements_descr, - response_model_exclude_none=True) +@app.get( + "/api/v2/search/statements", + summary=search_statements_summary, + response_description=search_statement_response_descr, + response_model=SearchStatementsService, + description=search_statements_descr, + response_model_exclude_none=True, +) async def get_statements( - variation: Optional[str] = Query(None, description=v_description), - disease: Optional[str] = Query(None, description=d_description), - therapy: Optional[str] = Query(None, description=t_description), - gene: Optional[str] = Query(None, description=g_description), - statement_id: Optional[str] = Query(None, description=s_description)): + variation: Optional[str] = Query(None, description=v_description), + disease: Optional[str] = Query(None, description=d_description), + therapy: Optional[str] = Query(None, description=t_description), + gene: Optional[str] = Query(None, description=g_description), + statement_id: Optional[str] = Query(None, description=s_description), +): """Return nested statements for queried concepts""" - resp = await query.search_statements(variation, disease, therapy, gene, - statement_id) + resp = await query.search_statements( + variation, disease, therapy, gene, statement_id + ) return resp -id_query_desc = ("Given Meta-KB statement_id, proposition_id, descriptor_id," - " document_id, or method_id return the node content.") -id_search_description = ("Return node of the queried node id.") +id_query_desc = ( + "Given Meta-KB statement_id, proposition_id, descriptor_id," + " document_id, or method_id return the node content." +) +id_search_description = "Return node of the queried node id." id_description = "Node ID to search" -@app.get('/api/v2/search/{id}', - summary=id_query_desc, - response_description=search_response_description, - response_model=SearchIDService, - description=id_search_description, - response_model_exclude_none=True) +@app.get( + "/api/v2/search/{id}", + summary=id_query_desc, + response_description=search_response_description, + response_model=SearchIDService, + description=id_search_description, + response_model_exclude_none=True, +) async def search_by_id(id: str = Path(description=id_description)): """Search by ID endpoint""" return query.search_by_id(id) diff --git a/metakb/normalizers.py b/metakb/normalizers.py index 3f14b6d1..90339589 100644 --- a/metakb/normalizers.py +++ b/metakb/normalizers.py @@ -2,16 +2,16 @@ import logging from typing import List, Optional, Tuple -from ga4gh.vrsatile.pydantic.vrsatile_models import VariationDescriptor, Extension -from variation.query import QueryHandler as VariationQueryHandler -from therapy.query import QueryHandler as TherapyQueryHandler -from therapy.schemas import NormalizationService as NormalizedTherapy, ApprovalRating from disease.query import QueryHandler as DiseaseQueryHandler from disease.schemas import NormalizationService as NormalizedDisease +from ga4gh.vrsatile.pydantic.vrsatile_models import Extension, VariationDescriptor from gene.database.dynamodb import DynamoDbDatabase from gene.query import QueryHandler as GeneQueryHandler from gene.schemas import NormalizeService as NormalizedGene - +from therapy.query import QueryHandler as TherapyQueryHandler +from therapy.schemas import ApprovalRating +from therapy.schemas import NormalizationService as NormalizedTherapy +from variation.query import QueryHandler as VariationQueryHandler logger = logging.getLogger(__name__) @@ -20,10 +20,11 @@ class VICCNormalizers: """A class for normalizing terms using VICC normalizers.""" def __init__( - self, gene_query_handler: Optional[GeneQueryHandler] = None, + self, + gene_query_handler: Optional[GeneQueryHandler] = None, variation_query_handler: Optional[VariationQueryHandler] = None, disease_query_handler: Optional[DiseaseQueryHandler] = None, - therapy_query_handler: Optional[TherapyQueryHandler] = None + therapy_query_handler: Optional[TherapyQueryHandler] = None, ) -> None: """Initialize the VICC Normalizers. @@ -34,7 +35,9 @@ def __init__( """ self.disease_query_handler = disease_query_handler or DiseaseQueryHandler() self.therapy_query_handler = therapy_query_handler or TherapyQueryHandler() - self.gene_query_handler = (gene_query_handler or GeneQueryHandler(DynamoDbDatabase())) # noqa: E501 + self.gene_query_handler = gene_query_handler or GeneQueryHandler( + DynamoDbDatabase() + ) if variation_query_handler: self.variation_query_handler = variation_query_handler @@ -56,12 +59,18 @@ async def normalize_variation( if not query: continue try: - variation_norm_resp = await self.variation_query_handler.normalize_handler.normalize(query) # noqa: E501 + variation_norm_resp = ( + await self.variation_query_handler.normalize_handler.normalize( + query + ) + ) if variation_norm_resp and variation_norm_resp.variation_descriptor: return variation_norm_resp.variation_descriptor - except Exception as e: # noqa: E722 - logger.warning(f"Variation Normalizer raised an exception using query" - f" {query}: {e}") + except Exception as e: + logger.warning( + f"Variation Normalizer raised an exception using query" + f" {query}: {e}" + ) return None def normalize_gene( @@ -82,13 +91,14 @@ def normalize_gene( try: gene_norm_resp = self.gene_query_handler.normalize(query_str) except Exception as e: - logger.warning(f"Gene Normalizer raised an exception using query " - f"{query_str}: {e}") + logger.warning( + f"Gene Normalizer raised an exception using query " + f"{query_str}: {e}" + ) else: if gene_norm_resp.match_type > highest_match: highest_match = gene_norm_resp.match_type - normalized_gene_id = \ - gene_norm_resp.gene_descriptor.gene_id + normalized_gene_id = gene_norm_resp.gene_descriptor.gene_id if highest_match == 100: break return gene_norm_resp, normalized_gene_id @@ -112,13 +122,16 @@ def normalize_disease( try: disease_norm_resp = self.disease_query_handler.normalize(query) except Exception as e: - logger.warning(f"Disease Normalizer raised an exception using query " - f"{query}: {e}") + logger.warning( + f"Disease Normalizer raised an exception using query " + f"{query}: {e}" + ) else: if disease_norm_resp.match_type > highest_match: highest_match = disease_norm_resp.match_type - normalized_disease_id = \ + normalized_disease_id = ( disease_norm_resp.disease_descriptor.disease_id + ) if highest_match == 100: break return disease_norm_resp, normalized_disease_id @@ -142,19 +155,23 @@ def normalize_therapy( try: therapy_norm_resp = self.therapy_query_handler.normalize(query) except Exception as e: - logger.warning(f"Therapy Normalizer raised an exception using " - f"query {query}: {e}") + logger.warning( + f"Therapy Normalizer raised an exception using " + f"query {query}: {e}" + ) else: if therapy_norm_resp.match_type > highest_match: highest_match = therapy_norm_resp.match_type - normalized_therapy_id = therapy_norm_resp.therapy_descriptor.therapy_id # noqa: E501 + normalized_therapy_id = ( + therapy_norm_resp.therapy_descriptor.therapy_id + ) if highest_match == 100: break return therapy_norm_resp, normalized_therapy_id @staticmethod def get_regulatory_approval_extension( - therapy_norm_resp: NormalizedTherapy + therapy_norm_resp: NormalizedTherapy, ) -> Optional[Extension]: """Given therapy normalization service response, extract out the regulatory approval extension @@ -164,7 +181,10 @@ def get_regulatory_approval_extension( data if it `regulatory_approval` extensions exists in therapy normalizer """ regulatory_approval_extension = None - tn_resp_exts = therapy_norm_resp.dict().get("therapy_descriptor", {}).get("extensions") or [] # noqa: E501 + tn_resp_exts = ( + therapy_norm_resp.dict().get("therapy_descriptor", {}).get("extensions") + or [] + ) tn_ext = [v for v in tn_resp_exts if v["name"] == "regulatory_approval"] if tn_ext: @@ -172,10 +192,14 @@ def get_regulatory_approval_extension( approval_ratings = ext_value.get("approval_ratings", []) matched_ext_value = None - if any(ar in {ApprovalRating.FDA_PRESCRIPTION, ApprovalRating.FDA_OTC} - for ar in approval_ratings): - if ApprovalRating.FDA_DISCONTINUED not in approval_ratings or \ - ApprovalRating.CHEMBL_4 in approval_ratings: # noqa: E125 + if any( + ar in {ApprovalRating.FDA_PRESCRIPTION, ApprovalRating.FDA_OTC} + for ar in approval_ratings + ): + if ( + ApprovalRating.FDA_DISCONTINUED not in approval_ratings + or ApprovalRating.CHEMBL_4 in approval_ratings + ): matched_ext_value = "FDA" elif ApprovalRating.CHEMBL_4 in approval_ratings: matched_ext_value = "chembl_phase_4" @@ -188,18 +212,23 @@ def get_regulatory_approval_extension( indication_exts = indication.get("extensions", []) for indication_ext in indication_exts: if indication_ext["value"] == matched_ext_value: - matched_indications.append({ - "id": indication["id"], - "type": indication["type"], - "label": indication["label"], - "disease_id": indication["disease_id"] - }) + matched_indications.append( + { + "id": indication["id"], + "type": indication["type"], + "label": indication["label"], + "disease_id": indication["disease_id"], + } + ) regulatory_approval_extension = Extension( name="regulatory_approval", value={ - "approval_rating": "FDA" if matched_ext_value == "FDA" else "ChEMBL", # noqa: E501 - "has_indications": matched_indications - }) + "approval_rating": "FDA" + if matched_ext_value == "FDA" + else "ChEMBL", + "has_indications": matched_indications, + }, + ) return regulatory_approval_extension diff --git a/metakb/query.py b/metakb/query.py index 12da43ae..fa8c5f30 100644 --- a/metakb/query.py +++ b/metakb/query.py @@ -1,24 +1,41 @@ """Module for queries.""" -from typing import Dict, List, Optional, Tuple -import logging import json +import logging from json.decoder import JSONDecodeError +from typing import Dict, List, Optional, Tuple from urllib.parse import quote -from ga4gh.vrsatile.pydantic.vrsatile_models import Extension, Expression +from ga4gh.vrsatile.pydantic.vrsatile_models import Expression, Extension +from neo4j import Record, Session, Transaction from neo4j.graph import Node -from neo4j import Transaction, Session, Record from metakb.database import Graph from metakb.normalizers import VICCNormalizers -from metakb.schemas import SearchService, SourceName, StatementResponse, \ - TherapeuticResponseProposition, VariationDescriptor, \ - ValueObjectDescriptor, GeneDescriptor, Method, \ - Document, SearchIDService, DiagnosticProposition, PrognosticProposition, \ - SearchStatementsService, NestedStatementResponse, PropositionType, \ - Proposition, ServiceMeta, Predicate -from metakb.transform.oncokb import GENE_EXT_CONVERSIONS, VARIATION_EXT_CONVERSIONS, \ - DISEASE_EXT_CONVERSIONS +from metakb.schemas import ( + DiagnosticProposition, + Document, + GeneDescriptor, + Method, + NestedStatementResponse, + Predicate, + PrognosticProposition, + Proposition, + PropositionType, + SearchIDService, + SearchService, + SearchStatementsService, + ServiceMeta, + SourceName, + StatementResponse, + TherapeuticResponseProposition, + ValueObjectDescriptor, + VariationDescriptor, +) +from metakb.transform.oncokb import ( + DISEASE_EXT_CONVERSIONS, + GENE_EXT_CONVERSIONS, + VARIATION_EXT_CONVERSIONS, +) logger = logging.getLogger("metakb.query") logger.setLevel(logging.DEBUG) @@ -32,9 +49,12 @@ class QueryHandler: """Class for handling queries.""" - def __init__(self, uri: str = "", - creds: Tuple[str, str] = ("", ""), - normalizers: VICCNormalizers = VICCNormalizers()) -> None: + def __init__( + self, + uri: str = "", + creds: Tuple[str, str] = ("", ""), + normalizers: VICCNormalizers = VICCNormalizers(), + ) -> None: """Initialize neo4j driver and the VICC normalizers. :param str uri: address of Neo4j DB :param Tuple[str, str] credentials: tuple containing username and @@ -44,48 +64,46 @@ def __init__(self, uri: str = "", self.driver = Graph(uri, creds).driver self.vicc_normalizers = normalizers - def get_normalized_therapy(self, therapy: str, - warnings: List[str]) -> Optional[str]: + def get_normalized_therapy( + self, therapy: str, warnings: List[str] + ) -> Optional[str]: """Get normalized therapy concept. :param str therapy: Therapy query :param List[str] warnings: A list of warnings for the search query :return: A normalized therapy concept if it exists """ - _, normalized_therapy_id = \ - self.vicc_normalizers.normalize_therapy([therapy]) + _, normalized_therapy_id = self.vicc_normalizers.normalize_therapy([therapy]) if not normalized_therapy_id: - warnings.append(f"Therapy Normalizer unable to normalize: " - f"{therapy}") + warnings.append(f"Therapy Normalizer unable to normalize: " f"{therapy}") return normalized_therapy_id - def get_normalized_disease(self, disease: str, - warnings: List[str]) -> Optional[str]: + def get_normalized_disease( + self, disease: str, warnings: List[str] + ) -> Optional[str]: """Get normalized disease concept. :param str disease: Disease query :param List[str] warnings: A list of warnings for the search query :return: A normalized disease concept if it exists """ - _, normalized_disease_id = \ - self.vicc_normalizers.normalize_disease([disease]) + _, normalized_disease_id = self.vicc_normalizers.normalize_disease([disease]) if not normalized_disease_id: - warnings.append(f"Disease Normalizer unable to normalize: " - f"{disease}") + warnings.append(f"Disease Normalizer unable to normalize: " f"{disease}") return normalized_disease_id - async def get_normalized_variation(self, variation: str, - warnings: List[str]) -> Optional[str]: + async def get_normalized_variation( + self, variation: str, warnings: List[str] + ) -> Optional[str]: """Get normalized variation concept. :param str variation: Variation query :param List[str] warnings: A list of warnings for the search query :return: A normalized variant concept if it exists """ - variant_norm_resp = \ - await self.vicc_normalizers.normalize_variation([variation]) + variant_norm_resp = await self.vicc_normalizers.normalize_variation([variation]) normalized_variation = None if variant_norm_resp: normalized_variation = variant_norm_resp.variation_id @@ -94,12 +112,12 @@ async def get_normalized_variation(self, variation: str, if variation.startswith(("ga4gh:VA.", "ga4gh:CNV.", "ga4gh:VH.")): normalized_variation = variation else: - warnings.append(f"Variant Normalizer unable to normalize: " - f"{variation}") + warnings.append( + f"Variant Normalizer unable to normalize: " f"{variation}" + ) return normalized_variation - def get_normalized_gene(self, gene: str, - warnings: List[str]) -> Optional[str]: + def get_normalized_gene(self, gene: str, warnings: List[str]) -> Optional[str]: """Get normalized gene concept. :param str gene: Gene query @@ -112,9 +130,13 @@ def get_normalized_gene(self, gene: str, return normalized_gene_id async def get_normalized_terms( - self, variation: Optional[str], disease: Optional[str], - therapy: Optional[str], gene: Optional[str], - statement_id: Optional[str], response: Dict + self, + variation: Optional[str], + disease: Optional[str], + therapy: Optional[str], + gene: Optional[str], + statement_id: Optional[str], + response: Dict, ) -> Optional[Tuple]: """Find normalized terms for queried concepts. @@ -133,28 +155,28 @@ async def get_normalized_terms( # Find normalized terms using VICC normalizers if therapy: response["query"]["therapy"] = therapy - normalized_therapy = \ - self.get_normalized_therapy(therapy.strip(), - response["warnings"]) + normalized_therapy = self.get_normalized_therapy( + therapy.strip(), response["warnings"] + ) else: normalized_therapy = None if disease: response["query"]["disease"] = disease - normalized_disease = \ - self.get_normalized_disease(disease.strip(), - response["warnings"]) + normalized_disease = self.get_normalized_disease( + disease.strip(), response["warnings"] + ) else: normalized_disease = None if variation: response["query"]["variation"] = variation - normalized_variation = \ - await self.get_normalized_variation(variation, response["warnings"]) + normalized_variation = await self.get_normalized_variation( + variation, response["warnings"] + ) else: normalized_variation = None if gene: response["query"]["gene"] = gene - normalized_gene = self.get_normalized_gene(gene, - response["warnings"]) + normalized_gene = self.get_normalized_gene(gene, response["warnings"]) else: normalized_gene = None @@ -171,23 +193,36 @@ async def get_normalized_terms( valid_statement_id = statement.get("id") else: response["warnings"].append( - f"Statement: {statement_id} does not exist.") + f"Statement: {statement_id} does not exist." + ) # If queried concept is given check that it is normalized / valid - if (variation and not normalized_variation) or \ - (therapy and not normalized_therapy) or \ - (disease and not normalized_disease) or \ - (gene and not normalized_gene) or \ - (statement_id and not valid_statement_id): + if ( + (variation and not normalized_variation) + or (therapy and not normalized_therapy) + or (disease and not normalized_disease) + or (gene and not normalized_gene) + or (statement_id and not valid_statement_id) + ): return None - return (normalized_variation, normalized_disease, normalized_therapy, - normalized_gene, statement, valid_statement_id) + return ( + normalized_variation, + normalized_disease, + normalized_therapy, + normalized_gene, + statement, + valid_statement_id, + ) async def search( - self, variation: Optional[str] = None, disease: Optional[str] = None, - therapy: Optional[str] = None, gene: Optional[str] = None, - statement_id: Optional[str] = None, detail: bool = False + self, + variation: Optional[str] = None, + disease: Optional[str] = None, + therapy: Optional[str] = None, + gene: Optional[str] = None, + statement_id: Optional[str] = None, + detail: bool = False, ) -> Dict: """Get statements and propositions from queried concepts. @@ -208,13 +243,10 @@ async def search( "therapy": None, "gene": None, "statement_id": None, - "detail": detail + "detail": detail, }, "warnings": [], - "matches": { - "statements": [], - "propositions": [] - }, + "matches": {"statements": [], "propositions": []}, "statements": [], # All Statements "propositions": [], # All propositions "variation_descriptors": [], @@ -223,21 +255,31 @@ async def search( "disease_descriptors": [], "methods": [], "documents": [], - "service_meta_": ServiceMeta().dict() + "service_meta_": ServiceMeta().dict(), } normalized_terms = await self.get_normalized_terms( - variation, disease, therapy, gene, statement_id, response) + variation, disease, therapy, gene, statement_id, response + ) if normalized_terms is None: return SearchService(**response).dict() - (normalized_variation, normalized_disease, - normalized_therapy, normalized_gene, statement, - valid_statement_id) = normalized_terms + ( + normalized_variation, + normalized_disease, + normalized_therapy, + normalized_gene, + statement, + valid_statement_id, + ) = normalized_terms session = self.driver.session() proposition_nodes = session.read_transaction( - self._get_propositions, valid_statement_id, normalized_variation, - normalized_therapy, normalized_disease, normalized_gene, + self._get_propositions, + valid_statement_id, + normalized_variation, + normalized_therapy, + normalized_disease, + normalized_gene, ) if not valid_statement_id: @@ -275,23 +317,20 @@ async def search( ) if proposition_nodes and statement_nodes: - response["statements"] = \ - self.get_statement_response(statement_nodes) - response["propositions"] = \ - self.get_propositions_response(proposition_nodes) + response["statements"] = self.get_statement_response(statement_nodes) + response["propositions"] = self.get_propositions_response(proposition_nodes) else: - response["warnings"].append("Could not find statements " - "associated with the queried" - " concepts.") + response["warnings"].append( + "Could not find statements " "associated with the queried" " concepts." + ) if detail: for s in response["statements"]: vd = self._get_variation_descriptor( response, session.read_transaction( - self._find_node_by_id, - s["variation_descriptor"] - ) + self._find_node_by_id, s["variation_descriptor"] + ), ) if vd not in response["variation_descriptors"]: response["variation_descriptors"].append(vd) @@ -315,9 +354,7 @@ async def search( response["disease_descriptors"].append(dd) m = self._get_method( - session.read_transaction( - self._find_node_by_id, s["method"] - ) + session.read_transaction(self._find_node_by_id, s["method"]) ) if m not in response["methods"]: response["methods"].append(m) @@ -328,9 +365,7 @@ async def search( for sb_id in s["supported_by"]: try: document = self._get_document( - session.read_transaction( - self._find_node_by_id, sb_id - ) + session.read_transaction(self._find_node_by_id, sb_id) ) if document: if document not in response["documents"]: @@ -338,10 +373,12 @@ async def search( except ValueError: sb_not_found.add(sb_id) if sb_not_found: - response["warnings"].append(f"Supported by evidence not " - f"yet supported in MetaKB: " - f"{sb_not_found} for " - f"{s['id']}") + response["warnings"].append( + f"Supported by evidence not " + f"yet supported in MetaKB: " + f"{sb_not_found} for " + f"{s['id']}" + ) else: response["variation_descriptors"] = None response["gene_descriptors"] = None @@ -363,7 +400,7 @@ def search_by_id(self, node_id: str) -> Dict: response = { "query": node_id, "warnings": [], - "service_meta_": ServiceMeta().dict() + "service_meta_": ServiceMeta().dict(), } if not node_id: @@ -374,20 +411,14 @@ def search_by_id(self, node_id: str) -> Dict: node_id = node_id.strip() if "%" not in node_id and ":" in node_id: concept_name = quote(node_id.split(":", 1)[1]) - node_id = \ - f"{node_id.split(':', 1)[0]}" \ - f":{concept_name}" + node_id = f"{node_id.split(':', 1)[0]}" f":{concept_name}" with self.driver.session() as session: - node = session.read_transaction( - self._find_node_by_id, node_id - ) + node = session.read_transaction(self._find_node_by_id, node_id) if node: valid_node_id = node.get("id") else: - response["warnings"].append(f"Node: {node_id} " - f"does not exist.") - if (not node_id and not valid_node_id) or \ - (node_id and not valid_node_id): + response["warnings"].append(f"Node: {node_id} " f"does not exist.") + if (not node_id and not valid_node_id) or (node_id and not valid_node_id): return SearchIDService(**response).dict(exclude_none=True) label, *_ = node.labels @@ -395,22 +426,27 @@ def search_by_id(self, node_id: str) -> Dict: statement = self._get_statement(node) if statement: response["statement"] = statement - elif label in ["Proposition", "TherapeuticResponse", - "Prognostic", "Diagnostic"]: + elif label in [ + "Proposition", + "TherapeuticResponse", + "Prognostic", + "Diagnostic", + ]: proposition = self._get_proposition(node) if proposition: response["proposition"] = proposition elif label == "VariationDescriptor": - response["variation_descriptor"] = \ - self._get_variation_descriptor(response, node) + response["variation_descriptor"] = self._get_variation_descriptor( + response, node + ) elif label == "TherapyDescriptor": - response["therapy_descriptor"] = \ - self._get_therapy_descriptor(node) + response["therapy_descriptor"] = self._get_therapy_descriptor(node) elif label == "DiseaseDescriptor": response["disease_descriptor"] = self._get_disease_descriptor(node) elif label == "GeneDescriptor": - response["gene_descriptor"] = \ - self._get_gene_descriptor(node, self._get_gene_value_object(node)) # noqa: E501 + response["gene_descriptor"] = self._get_gene_descriptor( + node, self._get_gene_value_object(node) + ) elif label == "Document": document = self._get_document(node) if document: @@ -419,13 +455,15 @@ def search_by_id(self, node_id: str) -> Dict: response["method"] = self._get_method(node) session.close() - return SearchIDService(**response).dict( - by_alias=True, exclude_none=True) + return SearchIDService(**response).dict(by_alias=True, exclude_none=True) async def search_statements( - self, variation: Optional[str] = None, - disease: Optional[str] = None, therapy: Optional[str] = None, - gene: Optional[str] = None, statement_id: Optional[str] = None + self, + variation: Optional[str] = None, + disease: Optional[str] = None, + therapy: Optional[str] = None, + gene: Optional[str] = None, + statement_id: Optional[str] = None, ) -> Dict: """Get nested statements from queried concepts @@ -443,30 +481,37 @@ async def search_statements( "disease": None, "therapy": None, "gene": None, - "statement_id": None + "statement_id": None, }, "warnings": [], - "matches": { - "statements": [], - "propositions": [] - }, + "matches": {"statements": [], "propositions": []}, "statements": [], - "service_meta_": ServiceMeta().dict() + "service_meta_": ServiceMeta().dict(), } normalized_terms = await self.get_normalized_terms( - variation, disease, therapy, gene, statement_id, response) + variation, disease, therapy, gene, statement_id, response + ) if normalized_terms is None: return SearchStatementsService(**response).dict() - (normalized_variation, normalized_disease, - normalized_therapy, normalized_gene, statement, - valid_statement_id) = normalized_terms + ( + normalized_variation, + normalized_disease, + normalized_therapy, + normalized_gene, + statement, + valid_statement_id, + ) = normalized_terms session = self.driver.session() statement_nodes = list() proposition_nodes = session.read_transaction( - self._get_propositions, valid_statement_id, normalized_variation, - normalized_therapy, normalized_disease, normalized_gene + self._get_propositions, + valid_statement_id, + normalized_variation, + normalized_therapy, + normalized_disease, + normalized_gene, ) proposition_cache = dict() @@ -477,8 +522,7 @@ async def search_statements( p_id = p_node.get("id") if p_id not in response["matches"]["propositions"]: response["matches"]["propositions"].append(p_id) - self._add_to_proposition_cache( - session, p_node, proposition_cache) + self._add_to_proposition_cache(session, p_node, proposition_cache) statements = session.read_transaction( self._get_statements_from_proposition, p_node.get("id") ) @@ -509,8 +553,7 @@ async def search_statements( if og_prop_nodes_len != len(proposition_nodes): for p_node in proposition_nodes: - self._add_to_proposition_cache( - session, p_node, proposition_cache) + self._add_to_proposition_cache(session, p_node, proposition_cache) methods_cache: Dict = dict() variations_cache: Dict = dict() @@ -543,9 +586,8 @@ async def search_statements( else: variation_descr = self._get_variation_descriptor( {}, - session.read_transaction( - self._find_node_by_id, variation_id), - gene_context_by_id=False + session.read_transaction(self._find_node_by_id, variation_id), + gene_context_by_id=False, ) variations_cache[variation_id] = variation_descr @@ -555,8 +597,7 @@ async def search_statements( therapy_descr = therapy_cache[therapy_id] else: therapy_descr = self._get_therapy_descriptor( - session.read_transaction(self._find_node_by_id, - therapy_id) + session.read_transaction(self._find_node_by_id, therapy_id) ) therapy_cache[therapy_id] = therapy_descr else: @@ -567,16 +608,13 @@ async def search_statements( disease_descr = disease_cache[disease_id] else: disease_descr = self._get_disease_descriptor( - session.read_transaction(self._find_node_by_id, - disease_id) + session.read_transaction(self._find_node_by_id, disease_id) ) disease_cache[disease_id] = disease_descr supported_by = list() sb_not_found = set() - sb_list = session.read_transaction( - self._find_and_return_supported_by, s_id - ) + sb_list = session.read_transaction(self._find_and_return_supported_by, s_id) for sb in sb_list: sb_id = sb.get("id") try: @@ -584,9 +622,7 @@ async def search_statements( document = document_cache[sb_id] else: document = self._get_document( - session.read_transaction( - self._find_node_by_id, sb_id - ) + session.read_transaction(self._find_node_by_id, sb_id) ) if document: @@ -598,10 +634,12 @@ async def search_statements( except ValueError: sb_not_found.add(sb_id) if sb_not_found: - response["warnings"].append(f"Supported by evidence not " - f"yet supported in MetaKB: " - f"{sb_not_found} for " - f"{s['id']}") + response["warnings"].append( + f"Supported by evidence not " + f"yet supported in MetaKB: " + f"{sb_not_found} for " + f"{s['id']}" + ) params = { "id": s_id, @@ -614,17 +652,18 @@ async def search_statements( "therapy_descriptor": therapy_descr, "disease_descriptor": disease_descr, "method": method, - "supported_by": supported_by + "supported_by": supported_by, } - response["statements"].append( - NestedStatementResponse(**params).dict()) + response["statements"].append(NestedStatementResponse(**params).dict()) added_statements.add(s_id) session.close() return SearchStatementsService(**response).dict( - by_alias=True, exclude_none=True) + by_alias=True, exclude_none=True + ) - def _add_to_proposition_cache(self, session: Session, p_node: Node, - proposition_cache: Dict) -> None: + def _add_to_proposition_cache( + self, session: Session, p_node: Node, proposition_cache: Dict + ) -> None: """Add a proposition to `proposition_cache` :param Session session: Neo4j driver session @@ -634,8 +673,7 @@ def _add_to_proposition_cache(self, session: Session, p_node: Node, p_id = p_node.get("id") if p_id not in proposition_cache: proposition_resp = session.read_transaction( - self._find_and_return_proposition_response, - p_id + self._find_and_return_proposition_response, p_id ) proposition_type = p_node.get("type") proposition = { @@ -643,25 +681,28 @@ def _add_to_proposition_cache(self, session: Session, p_node: Node, "type": proposition_type, "predicate": p_node.get("predicate"), "subject": proposition_resp["subject"], - "object_qualifier": proposition_resp["object_qualifier"] + "object_qualifier": proposition_resp["object_qualifier"], } if proposition_type == PropositionType.PREDICTIVE: proposition["object"] = proposition_resp["object"] - proposition = \ - TherapeuticResponseProposition(**proposition) + proposition = TherapeuticResponseProposition(**proposition) elif proposition_type == PropositionType.PROGNOSTIC: proposition = PrognosticProposition(**proposition) elif proposition_type == PropositionType.DIAGNOSTIC: proposition = DiagnosticProposition(**proposition) else: - raise ValueError(f"{proposition_type} is not a valid " - f"proposition type") + raise ValueError( + f"{proposition_type} is not a valid " f"proposition type" + ) if proposition: proposition_cache[p_id] = proposition def _get_variation_descriptor( - self, response: Dict, variation_descriptor: Node, - gene_context_by_id: bool = True) -> VariationDescriptor: + self, + response: Dict, + variation_descriptor: Node, + gene_context_by_id: bool = True, + ) -> VariationDescriptor: """Get variation descriptor :param Dict response: Query response object @@ -682,7 +723,7 @@ def _get_variation_descriptor( "gene_context": None, "molecule_context": variation_descriptor.get("molecule_context"), "structural_type": variation_descriptor.get("structural_type"), - "vrs_ref_allele_seq": variation_descriptor.get("vrs_ref_allele_seq"), # noqa: E501 + "vrs_ref_allele_seq": variation_descriptor.get("vrs_ref_allele_seq"), "expressions": [], "xrefs": variation_descriptor.get("xrefs"), "alternate_labels": variation_descriptor.get("alternate_labels"), @@ -696,11 +737,9 @@ def _get_variation_descriptor( gene_descriptor_id = gene_descriptor.get("id") gene_value_object = session.read_transaction( - self._find_descriptor_value_object, - gene_descriptor_id + self._find_descriptor_value_object, gene_descriptor_id ) - gene_context = self._get_gene_descriptor( - gene_descriptor, gene_value_object) + gene_context = self._get_gene_descriptor(gene_descriptor, gene_value_object) if gene_context_by_id: # Reference gene descriptor by id @@ -709,19 +748,19 @@ def _get_variation_descriptor( # gene context will be gene descriptor vd_params["gene_context"] = gene_context - if "gene_descriptors" in response and\ - gene_descriptor_id not in response["gene_descriptors"]: + if ( + "gene_descriptors" in response + and gene_descriptor_id not in response["gene_descriptors"] + ): response["gene_descriptors"].append(gene_context) # Get Variation Descriptor Expressions - for key in ["expressions_g", "expressions_p", - "expressions_c"]: + for key in ["expressions_g", "expressions_p", "expressions_c"]: if key in keys: for value in variation_descriptor.get(key): vd_params["expressions"].append( Expression( - syntax=f"hgvs.{key.split('_')[-1]}", - value=value + syntax=f"hgvs.{key.split('_')[-1]}", value=value ).dict() ) if not vd_params["expressions"]: @@ -730,29 +769,31 @@ def _get_variation_descriptor( extensions = [] # Get Variation Descriptor Extensions if vd_params["id"].startswith("civic.vid"): - for field in ["civic_representative_coordinate", - "civic_actionability_score"]: + for field in [ + "civic_representative_coordinate", + "civic_actionability_score", + ]: if field in keys: extensions.append( Extension( name=field, - value=json.loads(variation_descriptor.get(field)) + value=json.loads(variation_descriptor.get(field)), ).dict() ) with self.driver.session() as session: - variant_group = session.read_transaction( - self._get_variation_group, vid - ) + variant_group = session.read_transaction(self._get_variation_group, vid) if variant_group: variant_group = variant_group[0] vg = Extension( name="variant_group", - value=[{ - "id": variant_group.get("id"), - "label": variant_group.get("label"), - "description": variant_group.get("description"), - "type": "variant_group" - }] + value=[ + { + "id": variant_group.get("id"), + "label": variant_group.get("label"), + "description": variant_group.get("description"), + "type": "variant_group", + } + ], ).dict() for v in vg["value"]: if not v["description"]: @@ -764,7 +805,7 @@ def _get_variation_descriptor( extensions.append( Extension( name=field, - value=json.loads(variation_descriptor.get(field)) + value=json.loads(variation_descriptor.get(field)), ).dict() ) elif vd_params["id"].startswith("oncokb.variant"): @@ -772,8 +813,7 @@ def _get_variation_descriptor( if field in keys: extensions.append( Extension( - name=field, - value=json.loads(variation_descriptor[field]) + name=field, value=json.loads(variation_descriptor[field]) ).dict() ) @@ -797,15 +837,14 @@ def _get_variation_group(tx: Transaction, vid: str) -> Optional[Record]: :return: query record, containing variation group node if successful """ query = ( - "MATCH (vd:VariationDescriptor)-[:IN_VARIATION_GROUP]->(vg:VariationGroup) " # noqa: E501 + "MATCH (vd:VariationDescriptor)-[:IN_VARIATION_GROUP]->(vg:VariationGroup) " f"WHERE toLower(vd.id) = toLower('{vid}') " "RETURN vg" ) return tx.run(query).single() @staticmethod - def _get_variation_descriptors_gene(tx: Transaction, - vid: str) -> Optional[Node]: + def _get_variation_descriptors_gene(tx: Transaction, vid: str) -> Optional[Node]: """Get a Variation Descriptor's Gene Descriptor. :param Transaction tx: Neo4j session transaction :param str vid: variation descriptor ID @@ -819,8 +858,9 @@ def _get_variation_descriptors_gene(tx: Transaction, return tx.run(query).single()[0] @staticmethod - def _get_gene_descriptor(gene_descriptor: Node, - gene_value_object: Node) -> GeneDescriptor: + def _get_gene_descriptor( + gene_descriptor: Node, gene_value_object: Node + ) -> GeneDescriptor: """Add gene descriptor to response. :param Node gene_descriptor: Gene Descriptor Node @@ -844,8 +884,7 @@ def _get_gene_descriptor(gene_descriptor: Node, if field in keys: extensions.append( Extension( - name=field, - value=json.loads(gene_descriptor[field]) + name=field, value=json.loads(gene_descriptor[field]) ).dict() ) @@ -869,7 +908,7 @@ def _get_therapy_descriptor( "therapy_id": None, "alternate_labels": therapy_descriptor.get("alternate_labels"), "xrefs": therapy_descriptor.get("xrefs"), - "extensions": [] + "extensions": [], } key = "regulatory_approval" @@ -901,7 +940,7 @@ def _get_disease_descriptor( "label": disease_descriptor.get("label"), "disease_id": None, "xrefs": disease_descriptor.get("xrefs"), - "extensions": [] + "extensions": [], } keys = disease_descriptor.keys() @@ -910,8 +949,7 @@ def _get_disease_descriptor( if field in keys: dd_params["extensions"].append( Extension( - name=field, - value=json.loads(disease_descriptor[field]) + name=field, value=json.loads(disease_descriptor[field]) ).dict() ) @@ -965,16 +1003,13 @@ def _find_node_by_id(tx: Transaction, node_id: str) -> Optional[Node]: :param str node_id: ID of node to retrieve :return: Node object if successful """ - query = ( - "MATCH (n) " - f"WHERE toLower(n.id) = toLower('{node_id}') " - "RETURN n" - ) + query = "MATCH (n) " f"WHERE toLower(n.id) = toLower('{node_id}') " "RETURN n" return (tx.run(query).single() or [None])[0] @staticmethod - def _find_descriptor_value_object(tx: Transaction, - descriptor_id: str) -> Optional[Node]: + def _find_descriptor_value_object( + tx: Transaction, descriptor_id: str + ) -> Optional[Node]: """Find a Descriptor's value object. :param Transaction tx: Neo4j session transaction object :param str descriptor_id: ID of descriptor to look up @@ -988,8 +1023,8 @@ def _find_descriptor_value_object(tx: Transaction, return tx.run(query).single()[0] def add_proposition_and_statement_nodes( - self, session, statement_id: str, proposition_nodes: List, - statement_nodes: List): + self, session, statement_id: str, proposition_nodes: List, statement_nodes: List + ): """Get statements found in `supported_by` and their propositions and add to corresponding list. @@ -999,23 +1034,19 @@ def add_proposition_and_statement_nodes( :param List statement_nodes: List of statements """ supported_by_statements = session.read_transaction( - self._find_and_return_supported_by, statement_id, - only_statement=True + self._find_and_return_supported_by, statement_id, only_statement=True ) for s in supported_by_statements: if s not in statement_nodes: statement_nodes.append(s) proposition = session.read_transaction( - self._find_and_return_propositions_from_statement, - s.get("id") + self._find_and_return_propositions_from_statement, s.get("id") ) - if proposition and proposition \ - not in proposition_nodes: + if proposition and proposition not in proposition_nodes: proposition_nodes.append(proposition) @staticmethod - def _get_statement_by_id(tx: Transaction, - statement_id: str) -> Optional[Node]: + def _get_statement_by_id(tx: Transaction, statement_id: str) -> Optional[Node]: """Get a Statement node by ID. :param Transaction tx: Neo4j session transaction object @@ -1031,14 +1062,14 @@ def _get_statement_by_id(tx: Transaction, @staticmethod def _get_propositions( - tx: Transaction, - statement_id: str = "", - normalized_variation: str = "", - normalized_therapy: str = "", - normalized_disease: str = "", - normalized_gene: str = "", - prop_type: Optional[PropositionType] = None, - pred: Optional[Predicate] = None + tx: Transaction, + statement_id: str = "", + normalized_variation: str = "", + normalized_therapy: str = "", + normalized_disease: str = "", + normalized_gene: str = "", + prop_type: Optional[PropositionType] = None, + pred: Optional[Predicate] = None, ) -> List[Node]: """Get propositions that contain normalized concepts queried. Used as callback for Neo4j session API. @@ -1056,8 +1087,7 @@ def _get_propositions( query = "" params: Dict[str, str] = {} if prop_type and pred: - query += \ - "MATCH (p:Proposition {type:$prop_type, predicate:$pred}) " + query += "MATCH (p:Proposition {type:$prop_type, predicate:$pred}) " params["prop_type"] = prop_type.value params["pred"] = pred.value elif prop_type: @@ -1067,36 +1097,38 @@ def _get_propositions( query += "MATCH (p:Proposition {predicate:$pred}) " params["pred"] = pred.value if statement_id: - query += "MATCH (:Statement {id:$s_id})-[:DEFINED_BY]-> (p:Proposition) " # noqa: E501 + query += "MATCH (:Statement {id:$s_id})-[:DEFINED_BY]-> (p:Proposition) " params["s_id"] = statement_id if normalized_therapy: - query += \ - "MATCH (p:Proposition)<-[:IS_OBJECT_OF]-(:Therapy {id:$t_id}) " + query += "MATCH (p:Proposition)<-[:IS_OBJECT_OF]-(:Therapy {id:$t_id}) " params["t_id"] = normalized_therapy if normalized_variation: lower_normalized_variation = normalized_variation.lower() query += "MATCH (p:Proposition)<-[:IS_SUBJECT_OF]-(v:Variation " - if lower_normalized_variation.startswith('ga4gh:sq.'): + if lower_normalized_variation.startswith("ga4gh:sq."): # Sequence ID query += "{location_sequence_id: $v_id}) " else: query += "{id:$v_id}) " params["v_id"] = normalized_variation if normalized_disease: - query += "MATCH (p:Proposition)<-[:IS_OBJECT_QUALIFIER_OF]-(:Disease {id:$d_id}) " # noqa: E501 + query += "MATCH (p:Proposition)<-[:IS_OBJECT_QUALIFIER_OF]-(:Disease {id:$d_id}) " params["d_id"] = normalized_disease if normalized_gene: - query += "MATCH (:Gene {id:$g_id})<-[:DESCRIBES]-" \ - "(:GeneDescriptor)<-[:HAS_GENE]-" \ - "(:VariationDescriptor)-[:DESCRIBES]->(v:Variation)-" \ - "[:IS_SUBJECT_OF]->(p:Proposition) " + query += ( + "MATCH (:Gene {id:$g_id})<-[:DESCRIBES]-" + "(:GeneDescriptor)<-[:HAS_GENE]-" + "(:VariationDescriptor)-[:DESCRIBES]->(v:Variation)-" + "[:IS_SUBJECT_OF]->(p:Proposition) " + ) params["g_id"] = normalized_gene query += "RETURN DISTINCT p" return [p[0] for p in tx.run(query, **params)] @staticmethod - def _get_statements_from_proposition(tx: Transaction, - proposition_id: str) -> List[Node]: + def _get_statements_from_proposition( + tx: Transaction, proposition_id: str + ) -> List[Node]: """Get statements that are defined by a proposition. :param Transaction tx: Neo4j session transaction object @@ -1105,13 +1137,12 @@ def _get_statements_from_proposition(tx: Transaction, :return: List of statement Nodes """ query = ( - "MATCH (p:Proposition {id: $proposition_id})<-[:DEFINED_BY]-(s:Statement) " # noqa: E501 + "MATCH (p:Proposition {id: $proposition_id})<-[:DEFINED_BY]-(s:Statement) " "RETURN DISTINCT s" ) return [s[0] for s in tx.run(query, proposition_id=proposition_id)] - def get_statement_response(self, - statement_nodes: List[Node]) -> List[Dict]: + def get_statement_response(self, statement_nodes: List[Node]) -> List[Dict]: """Return a list of statements from Statement and Proposition nodes. :param List statement_nodes: A list of Statement Nodes @@ -1122,9 +1153,7 @@ def get_statement_response(self, for s in statement_nodes: s_id = s.get("id") if s_id not in added_statements: - statements_response.append( - self._get_statement(s) - ) + statements_response.append(self._get_statement(s)) added_statements.add(s_id) return statements_response @@ -1141,7 +1170,7 @@ def _find_and_return_statement_response( """ queries = ( ("MATCH (s)-[r1]->(td:TherapyDescriptor) ", "td.id AS tid,"), - ("", "") + ("", ""), ) for q in queries: query = ( @@ -1176,12 +1205,11 @@ def get_propositions_response( return propositions_response @staticmethod - def _find_and_return_proposition_response(tx: Transaction, - proposition_id: str) -> Record: + def _find_and_return_proposition_response( + tx: Transaction, proposition_id: str + ) -> Record: """Return value ids from a proposition.""" - queries = ( - ("MATCH (n) -[r1]-> (t:Therapy) ", "t.id AS object,"), ("", "") - ) + queries = (("MATCH (n) -[r1]-> (t:Therapy) ", "t.id AS object,"), ("", "")) for q in queries: query = ( f"MATCH (n) " @@ -1213,11 +1241,7 @@ def _find_and_return_supported_by( match = "MATCH (s:Statement)-[:CITES]->(sb) " else: match = "MATCH (s:Statement)-[:CITES]->(sb:Statement) " - query = ( - f"{match}" - f"WHERE s.id = '{statement_id}' " - "RETURN sb" - ) + query = f"{match}" f"WHERE s.id = '{statement_id}' " "RETURN sb" return [se[0] for se in tx.run(query)] @staticmethod @@ -1268,12 +1292,11 @@ def _get_proposition(self, p: Node) -> Proposition: "type": p_type, "predicate": p.get("predicate"), "subject": value_ids["subject"], - "object_qualifier": value_ids["object_qualifier"] + "object_qualifier": value_ids["object_qualifier"], } if p_type == PropositionType.PREDICTIVE: params["object"] = value_ids["object"] - proposition = \ - TherapeuticResponseProposition(**params) + proposition = TherapeuticResponseProposition(**params) elif p_type == PropositionType.PROGNOSTIC: proposition = PrognosticProposition(**params) elif p_type == PropositionType.DIAGNOSTIC: @@ -1291,9 +1314,11 @@ def _get_statement(self, s: Node) -> Dict: with self.driver.session() as session: statement_id = s.get("id") response = session.read_transaction( - self._find_and_return_statement_response, statement_id) + self._find_and_return_statement_response, statement_id + ) se_list = session.read_transaction( - self._find_and_return_supported_by, statement_id) + self._find_and_return_supported_by, statement_id + ) extensions = [] # Right now, only OncoKB has Statement extensions @@ -1302,7 +1327,7 @@ def _get_statement(self, s: Node) -> Dict: extensions.append( Extension( name="onckb_fda_level", - value=json.loads(s["onckb_fda_level"]) + value=json.loads(s["onckb_fda_level"]), ).dict() ) @@ -1314,11 +1339,13 @@ def _get_statement(self, s: Node) -> Dict: variation_origin=s.get("variation_origin"), proposition=response["p_id"], variation_descriptor=response["vid"], - therapy_descriptor=response["tid"] if "tid" in response.keys() else None, # noqa: E501 + therapy_descriptor=response["tid"] + if "tid" in response.keys() + else None, disease_descriptor=response["did"], method=response["m"]["id"], supported_by=[se["id"] for se in se_list], - extensions=extensions if extensions else None + extensions=extensions if extensions else None, ).dict(exclude_none=True) return statement diff --git a/metakb/schemas.py b/metakb/schemas.py index 40b69e7d..7580a7e1 100644 --- a/metakb/schemas.py +++ b/metakb/schemas.py @@ -1,10 +1,14 @@ """Common data model""" from enum import Enum, IntEnum -from typing import List, Optional, Union, Dict, Any, Type +from typing import Any, Dict, List, Optional, Type, Union from ga4gh.vrsatile.pydantic.vrs_models import CURIE -from ga4gh.vrsatile.pydantic.vrsatile_models import ValueObjectDescriptor, \ - GeneDescriptor, VariationDescriptor, Extension +from ga4gh.vrsatile.pydantic.vrsatile_models import ( + Extension, + GeneDescriptor, + ValueObjectDescriptor, + VariationDescriptor, +) from pydantic import BaseModel from pydantic.types import StrictBool @@ -22,103 +26,107 @@ class SourceName(str, Enum): class XrefSystem(str, Enum): """Define constraints for System in xrefs.""" - CLINVAR = 'clinvar' - CLINGEN = 'caid' - DB_SNP = 'dbsnp' - NCBI = 'ncbigene' - DISEASE_ONTOLOGY = 'do' + CLINVAR = "clinvar" + CLINGEN = "caid" + DB_SNP = "dbsnp" + NCBI = "ncbigene" + DISEASE_ONTOLOGY = "do" class SourcePrefix(str, Enum): """Define constraints for source prefixes.""" - PUBMED = 'pmid' - ASCO = 'asco' + PUBMED = "pmid" + ASCO = "asco" class NormalizerPrefix(str, Enum): """Define constraints for normalizer prefixes.""" - GENE = 'gene' + GENE = "gene" class PropositionType(str, Enum): """Define constraints for proposition type.""" - PREDICTIVE = 'therapeutic_response_proposition' - DIAGNOSTIC = 'diagnostic_proposition' - PROGNOSTIC = 'prognostic_proposition' - PREDISPOSING = 'predisposition_proposition' - FUNCTIONAL = 'functional_consequence_proposition' - ONCOGENIC = 'oncogenicity_proposition' - PATHOGENIC = 'pathogenicity_proposition' + PREDICTIVE = "therapeutic_response_proposition" + DIAGNOSTIC = "diagnostic_proposition" + PROGNOSTIC = "prognostic_proposition" + PREDISPOSING = "predisposition_proposition" + FUNCTIONAL = "functional_consequence_proposition" + ONCOGENIC = "oncogenicity_proposition" + PATHOGENIC = "pathogenicity_proposition" class PredictivePredicate(str, Enum): """Define constraints for predictive predicate.""" - SENSITIVITY = 'predicts_sensitivity_to' - RESISTANCE = 'predicts_resistance_to' + SENSITIVITY = "predicts_sensitivity_to" + RESISTANCE = "predicts_resistance_to" class DiagnosticPredicate(str, Enum): """Define constraints for diagnostic predicate.""" - POSITIVE = 'is_diagnostic_inclusion_criterion_for' - NEGATIVE = 'is_diagnostic_exclusion_criterion_for' + POSITIVE = "is_diagnostic_inclusion_criterion_for" + NEGATIVE = "is_diagnostic_exclusion_criterion_for" class PrognosticPredicate(str, Enum): """Define constraints for prognostic predicate.""" - BETTER_OUTCOME = 'is_prognostic_of_better_outcome_for' - POOR_OUTCOME = 'is_prognostic_of_worse_outcome_for' + BETTER_OUTCOME = "is_prognostic_of_better_outcome_for" + POOR_OUTCOME = "is_prognostic_of_worse_outcome_for" class PathogenicPredicate(str, Enum): """Define constraints for the pathogenicity predicate.""" - UNCERTAIN_SIGNIFICANCE = 'is_of_uncertain_significance_for' - PATHOGENIC = 'is_pathogenic_for' - BENIGN = 'is_benign_for' + UNCERTAIN_SIGNIFICANCE = "is_of_uncertain_significance_for" + PATHOGENIC = "is_pathogenic_for" + BENIGN = "is_benign_for" class FunctionalPredicate(str, Enum): """Define constraints for functional predicate.""" - GAIN_OF_FUNCTION = 'causes_gain_of_function_of' - LOSS_OF_FUNCTION = 'causes_loss_of_function_of' - UNALTERED_FUNCTION = 'does_not_change_function_of' - NEOMORPHIC = 'causes_neomorphic_function_of' - DOMINATE_NEGATIVE = 'causes_dominant_negative_function_of' + GAIN_OF_FUNCTION = "causes_gain_of_function_of" + LOSS_OF_FUNCTION = "causes_loss_of_function_of" + UNALTERED_FUNCTION = "does_not_change_function_of" + NEOMORPHIC = "causes_neomorphic_function_of" + DOMINATE_NEGATIVE = "causes_dominant_negative_function_of" -Predicate = Union[PredictivePredicate, DiagnosticPredicate, - PrognosticPredicate, PathogenicPredicate, - FunctionalPredicate] +Predicate = Union[ + PredictivePredicate, + DiagnosticPredicate, + PrognosticPredicate, + PathogenicPredicate, + FunctionalPredicate, +] class VariationOrigin(str, Enum): """Define constraints for variant origin.""" - SOMATIC = 'somatic' - GERMLINE = 'germline' - NOT_APPLICABLE = 'N/A' + SOMATIC = "somatic" + GERMLINE = "germline" + NOT_APPLICABLE = "N/A" class Direction(str, Enum): """Define constraints for evidence direction.""" - SUPPORTS = 'supports' - DOES_NOT_SUPPORT = 'does_not_support' + SUPPORTS = "supports" + DOES_NOT_SUPPORT = "does_not_support" class MoleculeContext(str, Enum): """Define constraints for types of molecule context.""" - GENOMIC = 'genomic' - TRANSCRIPT = 'transcript' - PROTEIN = 'protein' + GENOMIC = "genomic" + TRANSCRIPT = "transcript" + PROTEIN = "protein" class Proposition(BaseModel): @@ -142,21 +150,21 @@ class Config: """Configure examples.""" @staticmethod - def schema_extra(schema: Dict[str, Any], - model: Type['TherapeuticResponseProposition']) \ - -> None: + def schema_extra( + schema: Dict[str, Any], model: Type["TherapeuticResponseProposition"] + ) -> None: """Configure OpenAPI schema""" - if 'title' in schema.keys(): - schema.pop('title', None) - for prop in schema.get('properties', {}).values(): - prop.pop('title', None) - schema['example'] = { + if "title" in schema.keys(): + schema.pop("title", None) + for prop in schema.get("properties", {}).values(): + prop.pop("title", None) + schema["example"] = { "id": "proposition:133", "type": "therapeutic_response_proposition", "predicate": "predicts_sensitivity_to", "subject": "ga4gh:VA.kgjrhgf84CEndyLjKdAO0RxN-e3pJjxA", "object_qualifier": "ncit:C2926", - "object": "rxcui:1430438" + "object": "rxcui:1430438", } @@ -188,7 +196,7 @@ class Statement(BaseModel): """Define Statement model.""" id: CURIE - type = 'Statement' + type = "Statement" description: Optional[str] direction: Optional[Direction] evidence_level: CURIE @@ -211,7 +219,7 @@ class Document(BaseModel): label: str description: Optional[str] xrefs: Optional[List[CURIE]] - type = 'Document' + type = "Document" class Date(BaseModel): @@ -225,38 +233,37 @@ class Config: """Configure examples.""" @staticmethod - def schema_extra(schema: Dict[str, Any], - model: Type['StatementResponse']) -> None: + def schema_extra( + schema: Dict[str, Any], model: Type["StatementResponse"] + ) -> None: """Configure OpenAPI schema""" - if 'title' in schema.keys(): - schema.pop('title', None) - for prop in schema.get('properties', {}).values(): - prop.pop('title', None) - schema['example'] = { - "year": 2019, - "month": 11, - "day": 29 - } + if "title" in schema.keys(): + schema.pop("title", None) + for prop in schema.get("properties", {}).values(): + prop.pop("title", None) + schema["example"] = {"year": 2019, "month": 11, "day": 29} class Method(BaseModel): - """Define model for methods used in evidence curation and classifications.""" # noqa: E501 + """Define model for methods used in evidence curation and classifications.""" id: CURIE label: str url: str version: Date authors: str - type = 'Method' + type = "Method" class Response(BaseModel): """Define the Response Model.""" statements: List[Statement] - propositions: List[Union[TherapeuticResponseProposition, - PrognosticProposition, - DiagnosticProposition]] + propositions: List[ + Union[ + TherapeuticResponseProposition, PrognosticProposition, DiagnosticProposition + ] + ] variation_descriptors: List[VariationDescriptor] gene_descriptors: List[GeneDescriptor] therapy_descriptors: Optional[List[ValueObjectDescriptor]] @@ -269,7 +276,7 @@ class StatementResponse(BaseModel): """Define Statement Response for Search Endpoint.""" id: CURIE - type = 'Statement' + type = "Statement" description: Optional[str] direction: Optional[Direction] evidence_level: CURIE @@ -286,16 +293,17 @@ class Config: """Configure examples.""" @staticmethod - def schema_extra(schema: Dict[str, Any], - model: Type['StatementResponse']) -> None: + def schema_extra( + schema: Dict[str, Any], model: Type["StatementResponse"] + ) -> None: """Configure OpenAPI schema""" - if 'title' in schema.keys(): - schema.pop('title', None) - for prop in schema.get('properties', {}).values(): - prop.pop('title', None) - schema['example'] = { + if "title" in schema.keys(): + schema.pop("title", None) + for prop in schema.get("properties", {}).values(): + prop.pop("title", None) + schema["example"] = { "id": "civic.eid:2997", - "description": "Afatinib, an irreversible inhibitor of the ErbB family of tyrosine kinases has been approved in the US for the first-line treatment of patients with metastatic non-small-cell lung cancer (NSCLC) who have tumours with EGFR exon 19 deletions or exon 21 (L858R) substitution mutations as detected by a US FDA-approved test", # noqa: E501 + "description": "Afatinib, an irreversible inhibitor of the ErbB family of tyrosine kinases has been approved in the US for the first-line treatment of patients with metastatic non-small-cell lung cancer (NSCLC) who have tumours with EGFR exon 19 deletions or exon 21 (L858R) substitution mutations as detected by a US FDA-approved test", "direction": "supports", "evidence_level": "civic.evidence_level:A", "variation_origin": "somatic", @@ -304,10 +312,8 @@ def schema_extra(schema: Dict[str, Any], "therapy_descriptor": "civic.tid:146", "disease_descriptor": "civic.did:8", "method": "method:001", - "supported_by": [ - "pmid:23982599" - ], - "type": "Statement" + "supported_by": ["pmid:23982599"], + "type": "Statement", } @@ -315,14 +321,14 @@ class NestedStatementResponse(BaseModel): """Define Statement Response for Search Endpoint.""" id: CURIE - type = 'Statement' + type = "Statement" description: Optional[str] direction: Optional[Direction] evidence_level: CURIE variation_origin: Optional[VariationOrigin] - proposition: Union[TherapeuticResponseProposition, - PrognosticProposition, - DiagnosticProposition] + proposition: Union[ + TherapeuticResponseProposition, PrognosticProposition, DiagnosticProposition + ] variation_descriptor: VariationDescriptor therapy_descriptor: Optional[ValueObjectDescriptor] disease_descriptor: ValueObjectDescriptor @@ -334,16 +340,17 @@ class Config: """Configure examples.""" @staticmethod - def schema_extra(schema: Dict[str, Any], - model: Type['NestedStatementResponse']) -> None: + def schema_extra( + schema: Dict[str, Any], model: Type["NestedStatementResponse"] + ) -> None: """Configure OpenAPI schema""" - if 'title' in schema.keys(): - schema.pop('title', None) - for prop in schema.get('properties', {}).values(): - prop.pop('title', None) - schema['example'] = { + if "title" in schema.keys(): + schema.pop("title", None) + for prop in schema.get("properties", {}).values(): + prop.pop("title", None) + schema["example"] = { "id": "civic.eid:2997", - "description": "Afatinib, an irreversible inhibitor of the ErbB family of tyrosine kinases has been approved in the US for the first-line treatment of patients with metastatic non-small-cell lung cancer (NSCLC) who have tumours with EGFR exon 19 deletions or exon 21 (L858R) substitution mutations as detected by a US FDA-approved test", # noqa: E501 + "description": "Afatinib, an irreversible inhibitor of the ErbB family of tyrosine kinases has been approved in the US for the first-line treatment of patients with metastatic non-small-cell lung cancer (NSCLC) who have tumours with EGFR exon 19 deletions or exon 21 (L858R) substitution mutations as detected by a US FDA-approved test", "direction": "supports", "evidence_level": "civic.evidence_level:A", "variation_origin": "somatic", @@ -353,7 +360,7 @@ def schema_extra(schema: Dict[str, Any], "predicate": "predicts_sensitivity_to", "subject": "ga4gh:VA.kgjrhgf84CEndyLjKdAO0RxN-e3pJjxA", "object_qualifier": "ncit:C2926", - "object": "rxcui:1430438" + "object": "rxcui:1430438", }, "variation_descriptor": { "id": "civic.vid:33", @@ -364,11 +371,9 @@ def schema_extra(schema: Dict[str, Any], "clinvar:16609", "clinvar:376282", "caid:CA126713", - "dbsnp:121434568" - ], - "alternate_labels": [ - "LEU858ARG" + "dbsnp:121434568", ], + "alternate_labels": ["LEU858ARG"], "extensions": [ { "type": "Extension", @@ -379,72 +384,58 @@ def schema_extra(schema: Dict[str, Any], "stop": 55259515, "reference_bases": "T", "variant_bases": "G", - "representative_transcript": - "ENST00000275493.2", + "representative_transcript": "ENST00000275493.2", "ensembl_version": 75, "reference_build": "GRCh37", - "type": "coordinates" - } + "type": "coordinates", + }, } ], - "variation_id": - "ga4gh:VA.kgjrhgf84CEndyLjKdAO0RxN-e3pJjxA", + "variation_id": "ga4gh:VA.kgjrhgf84CEndyLjKdAO0RxN-e3pJjxA", "variation": { "_id": "ga4gh:VA.kgjrhgf84CEndyLjKdAO0RxN-e3pJjxA", "type": "Allele", "location": { - "_id": "ga4gh:VSL.Sfs_3PlVEYp9BxBsHsFfU1tvhfDq361f", # noqa: E501 + "_id": "ga4gh:VSL.Sfs_3PlVEYp9BxBsHsFfU1tvhfDq361f", "type": "SequenceLocation", - "sequence_id": - "ga4gh:SQ.vyo55F6mA6n2LgN4cagcdRzOuh38V4mE", + "sequence_id": "ga4gh:SQ.vyo55F6mA6n2LgN4cagcdRzOuh38V4mE", "interval": { "type": "SequenceInterval", - "start": { - "type": "Number", - "value": 857 - }, - "end": { - "type": "Number", - "value": 858 - } - } + "start": {"type": "Number", "value": 857}, + "end": {"type": "Number", "value": 858}, + }, }, - "state": { - "type": "LiteralSequenceExpression", - "sequence": "R" - } + "state": {"type": "LiteralSequenceExpression", "sequence": "R"}, }, "structural_type": "SO:0001583", "expressions": [ { "type": "Expression", "syntax": "hgvs.g", - "value": "NC_000007.13:g.55259515T>G" + "value": "NC_000007.13:g.55259515T>G", }, { "type": "Expression", "syntax": "hgvs.p", - "value": "NP_005219.2:p.Leu858Arg" + "value": "NP_005219.2:p.Leu858Arg", }, { "type": "Expression", "syntax": "hgvs.c", - "value": "NM_005228.4:c.2573T>G" + "value": "NM_005228.4:c.2573T>G", }, { "type": "Expression", "syntax": "hgvs.c", - "value": "ENST00000275493.2:c.2573T>G" - } + "value": "ENST00000275493.2:c.2573T>G", + }, ], "gene_context": { "id": "civic.gid:19", "type": "GeneDescriptor", "label": "EGFR", - "description": "EGFR is widely recognized for its importance in cancer. Amplification and mutations have been shown to be driving events in many cancer types. Its role in non-small cell lung cancer, glioblastoma and basal-like breast cancers has spurred many research and drug development efforts. Tyrosine kinase inhibitors have shown efficacy in EGFR amplfied tumors, most notably gefitinib and erlotinib. Mutations in EGFR have been shown to confer resistance to these drugs, particularly the variant T790M, which has been functionally characterized as a resistance marker for both of these drugs. The later generation TKI's have seen some success in treating these resistant cases, and targeted sequencing of the EGFR locus has become a common practice in treatment of non-small cell lung cancer. \nOverproduction of ligands is another possible mechanism of activation of EGFR. ERBB ligands include EGF, TGF-a, AREG, EPG, BTC, HB-EGF, EPR and NRG1-4 (for detailed information please refer to the respective ligand section).", # noqa: E501 - "xrefs": [ - "ncbigene:1956" - ], + "description": "EGFR is widely recognized for its importance in cancer. Amplification and mutations have been shown to be driving events in many cancer types. Its role in non-small cell lung cancer, glioblastoma and basal-like breast cancers has spurred many research and drug development efforts. Tyrosine kinase inhibitors have shown efficacy in EGFR amplfied tumors, most notably gefitinib and erlotinib. Mutations in EGFR have been shown to confer resistance to these drugs, particularly the variant T790M, which has been functionally characterized as a resistance marker for both of these drugs. The later generation TKI's have seen some success in treating these resistant cases, and targeted sequencing of the EGFR locus has become a common practice in treatment of non-small cell lung cancer. \nOverproduction of ligands is another possible mechanism of activation of EGFR. ERBB ligands include EGF, TGF-a, AREG, EPG, BTC, HB-EGF, EPR and NRG1-4 (for detailed information please refer to the respective ligand section).", + "xrefs": ["ncbigene:1956"], "alternate_labels": [ "ERRP", "EGFR", @@ -453,56 +444,47 @@ def schema_extra(schema: Dict[str, Any], "NISBD2", "HER1", "ERBB1", - "ERBB" + "ERBB", ], - "gene_id": "hgnc:3236" - } + "gene_id": "hgnc:3236", + }, }, "therapy_descriptor": { "id": "civic.tid:146", "type": "TherapyDescriptor", "label": "Afatinib", - "xrefs": [ - "ncit:C66940" - ], + "xrefs": ["ncit:C66940"], "alternate_labels": [ "BIBW2992", "BIBW 2992", - "(2e)-N-(4-(3-Chloro-4-Fluoroanilino)-7-(((3s)-Oxolan-3-yl)Oxy)Quinoxazolin-6-yl)-4-(Dimethylamino)But-2-Enamide" # noqa: E501 + "(2e)-N-(4-(3-Chloro-4-Fluoroanilino)-7-(((3s)-Oxolan-3-yl)Oxy)Quinoxazolin-6-yl)-4-(Dimethylamino)But-2-Enamide", ], - "therapy_id": "rxcui:1430438" + "therapy_id": "rxcui:1430438", }, "disease_descriptor": { "id": "civic.did:8", "type": "DiseaseDescriptor", "label": "Lung Non-small Cell Carcinoma", - "xrefs": [ - "DOID:3908" - ], - "disease_id": "ncit:C2926" + "xrefs": ["DOID:3908"], + "disease_id": "ncit:C2926", }, "method": { "id": "method:001", - "label": "Standard operating procedure for curation and clinical interpretation of variants in cancer", # noqa: E501 - "url": "https://genomemedicine.biomedcentral.com/articles/10.1186/s13073-019-0687-x", # noqa: E501 - "version": { - "year": 2019, - "month": 11, - "day": 29 - }, - "authors": - "Danos, A.M., Krysiak, K., Barnell, E.K. et al.", - "type": "Method" + "label": "Standard operating procedure for curation and clinical interpretation of variants in cancer", + "url": "https://genomemedicine.biomedcentral.com/articles/10.1186/s13073-019-0687-x", + "version": {"year": 2019, "month": 11, "day": 29}, + "authors": "Danos, A.M., Krysiak, K., Barnell, E.K. et al.", + "type": "Method", }, "supported_by": [ { "id": "pmid:23982599", "label": "Dungo et al., 2013, Drugs", "description": "Afatinib: first global approval.", - "type": "Document" + "type": "Document", } ], - "type": "Statement" + "type": "Statement", } @@ -520,19 +502,18 @@ class Config: """Configure examples.""" @staticmethod - def schema_extra(schema: Dict[str, Any], - model: Type['SearchQuery']) -> None: + def schema_extra(schema: Dict[str, Any], model: Type["SearchQuery"]) -> None: """Configure OpenAPI schema""" - if 'title' in schema.keys(): - schema.pop('title', None) - for prop in schema.get('properties', {}).values(): - prop.pop('title', None) - schema['example'] = { + if "title" in schema.keys(): + schema.pop("title", None) + for prop in schema.get("properties", {}).values(): + prop.pop("title", None) + schema["example"] = { "variation": "NP_005219.2:p.Leu858Arg", "disease": "Lung Non-small Cell Carcinoma", "therapy": "Afatinib", "statement_id": "civic.eid:2997", - "detail": False + "detail": False, } @@ -549,18 +530,19 @@ class Config: """Configure examples.""" @staticmethod - def schema_extra(schema: Dict[str, Any], - model: Type['SearchStatementsQuery']) -> None: + def schema_extra( + schema: Dict[str, Any], model: Type["SearchStatementsQuery"] + ) -> None: """Configure OpenAPI schema""" - if 'title' in schema.keys(): - schema.pop('title', None) - for prop in schema.get('properties', {}).values(): - prop.pop('title', None) - schema['example'] = { + if "title" in schema.keys(): + schema.pop("title", None) + for prop in schema.get("properties", {}).values(): + prop.pop("title", None) + schema["example"] = { "variation": "NP_005219.2:p.Leu858Arg", "disease": "Lung Non-small Cell Carcinoma", "therapy": "Afatinib", - "statement_id": "civic.eid:2997" + "statement_id": "civic.eid:2997", } @@ -574,16 +556,15 @@ class Config: """Configure examples.""" @staticmethod - def schema_extra(schema: Dict[str, Any], - model: Type['Matches']) -> None: + def schema_extra(schema: Dict[str, Any], model: Type["Matches"]) -> None: """Configure OpenAPI schema""" - if 'title' in schema.keys(): - schema.pop('title', None) - for prop in schema.get('properties', {}).values(): - prop.pop('title', None) - schema['example'] = { + if "title" in schema.keys(): + schema.pop("title", None) + for prop in schema.get("properties", {}).values(): + prop.pop("title", None) + schema["example"] = { "statements": ["civic.eid:2997"], - "propositions": ["proposition:133"] + "propositions": ["proposition:133"], } @@ -598,8 +579,7 @@ class Config: """Configure schema example.""" @staticmethod - def schema_extra(schema: Dict[str, Any], - model: Type["ServiceMeta"]) -> None: + def schema_extra(schema: Dict[str, Any], model: Type["ServiceMeta"]) -> None: """Configure OpenAPI schema""" if "title" in schema.keys(): schema.pop("title", None) @@ -608,7 +588,7 @@ def schema_extra(schema: Dict[str, Any], schema["example"] = { "name": "metakb", "version": "1.1.0-alpha.4", - "url": "https://github.com/cancervariants/metakb" + "url": "https://github.com/cancervariants/metakb", } @@ -619,9 +599,15 @@ class SearchService(BaseModel): warnings: Optional[List[str]] matches: Matches statements: Optional[List[StatementResponse]] - propositions: Optional[List[Union[TherapeuticResponseProposition, - DiagnosticProposition, - PrognosticProposition]]] + propositions: Optional[ + List[ + Union[ + TherapeuticResponseProposition, + DiagnosticProposition, + PrognosticProposition, + ] + ] + ] variation_descriptors: Optional[List[VariationDescriptor]] gene_descriptors: Optional[List[GeneDescriptor]] therapy_descriptors: Optional[List[ValueObjectDescriptor]] @@ -634,30 +620,29 @@ class Config: """Configure examples.""" @staticmethod - def schema_extra(schema: Dict[str, Any], - model: Type['SearchService']) -> None: + def schema_extra(schema: Dict[str, Any], model: Type["SearchService"]) -> None: """Configure OpenAPI schema""" - if 'title' in schema.keys(): - schema.pop('title', None) - for prop in schema.get('properties', {}).values(): - prop.pop('title', None) - schema['example'] = { + if "title" in schema.keys(): + schema.pop("title", None) + for prop in schema.get("properties", {}).values(): + prop.pop("title", None) + schema["example"] = { "query": { "variation": "EGFR L858R", "disease": "Lung Non-small Cell Carcinoma", "therapy": "Afatinib", "statement_id": "civic.eid:2997", - "detail": False + "detail": False, }, "warnings": [], "matches": { "statements": ["civic.eid:2997"], - "propositions": ["proposition:109"] + "propositions": ["proposition:109"], }, "statements": [ { "id": "civic.eid:2997", - "description": "Afatinib, an irreversible inhibitor of the ErbB family of tyrosine kinases has been approved in the US for the first-line treatment of patients with metastatic non-small-cell lung cancer (NSCLC) who have tumours with EGFR exon 19 deletions or exon 21 (L858R) substitution mutations as detected by a US FDA-approved test", # noqa: E501 + "description": "Afatinib, an irreversible inhibitor of the ErbB family of tyrosine kinases has been approved in the US for the first-line treatment of patients with metastatic non-small-cell lung cancer (NSCLC) who have tumours with EGFR exon 19 deletions or exon 21 (L858R) substitution mutations as detected by a US FDA-approved test", "direction": "supports", "evidence_level": "civic.evidence_level:A", "variation_origin": "somatic", @@ -666,10 +651,8 @@ def schema_extra(schema: Dict[str, Any], "therapy_descriptor": "civic.tid:146", "disease_descriptor": "civic.did:8", "method": "method:001", - "supported_by": [ - "pmid:23982599" - ], - "type": "Statement" + "supported_by": ["pmid:23982599"], + "type": "Statement", } ], "propositions": [ @@ -679,15 +662,15 @@ def schema_extra(schema: Dict[str, Any], "predicate": "predicts_sensitivity_to", "subject": "ga4gh:VA.kgjrhgf84CEndyLjKdAO0RxN-e3pJjxA", "object_qualifier": "ncit:C2926", - "object": "rxcui:1430438" + "object": "rxcui:1430438", } ], "service_meta_": { "name": "metakb", "version": "1.1.0-alpha.4", "last_updated": "2021-12-16", - "url": "https://github.com/cancervariants/metakb" - } + "url": "https://github.com/cancervariants/metakb", + }, } @@ -697,9 +680,11 @@ class SearchIDService(BaseModel): query: str warnings: Optional[List[str]] statement: Optional[StatementResponse] - proposition: Optional[Union[TherapeuticResponseProposition, - DiagnosticProposition, - PrognosticProposition]] + proposition: Optional[ + Union[ + TherapeuticResponseProposition, DiagnosticProposition, PrognosticProposition + ] + ] variation_descriptor: Optional[VariationDescriptor] gene_descriptor: Optional[GeneDescriptor] therapy_descriptor: Optional[ValueObjectDescriptor] @@ -712,53 +697,45 @@ class Config: """Configure examples.""" @staticmethod - def schema_extra(schema: Dict[str, Any], - model: Type['SearchIDService']) -> None: + def schema_extra( + schema: Dict[str, Any], model: Type["SearchIDService"] + ) -> None: """Configure OpenAPI schema""" - if 'title' in schema.keys(): - schema.pop('title', None) - for prop in schema.get('properties', {}).values(): - prop.pop('title', None) - schema['example'] = { - "query": { - "node_id": "civic.vid:33" - }, + if "title" in schema.keys(): + schema.pop("title", None) + for prop in schema.get("properties", {}).values(): + prop.pop("title", None) + schema["example"] = { + "query": {"node_id": "civic.vid:33"}, "warnings": [], - "matches": { - "node": "civic.vid:33" - }, + "matches": {"node": "civic.vid:33"}, "variation_descriptors": [ { "id": "civic.vid:33", "type": "VariationDescriptor", "label": "L858R", - "value_id": "ga4gh:VA.WyOqFMhc8aOnMFgdY0uM7nSLNqxVPAiR", # noqa: E501 + "value_id": "ga4gh:VA.WyOqFMhc8aOnMFgdY0uM7nSLNqxVPAiR", "value": { "location": { "interval": { "end": 858, "start": 857, - "type": "SimpleInterval" + "type": "SimpleInterval", }, - "sequence_id": "ga4gh:SQ.vyo55F6mA6n2LgN4cagcdRzOuh38V4mE", # noqa: E501 - "type": "SequenceLocation" - }, - "state": { - "sequence": "R", - "type": "SequenceState" + "sequence_id": "ga4gh:SQ.vyo55F6mA6n2LgN4cagcdRzOuh38V4mE", + "type": "SequenceLocation", }, - "type": "Allele" + "state": {"sequence": "R", "type": "SequenceState"}, + "type": "Allele", }, "xrefs": [ "clinvar:376280", "clinvar:16609", "clinvar:376282", "caid:CA126713", - "dbsnp:121434568" - ], - "alternate_labels": [ - "LEU858ARG" + "dbsnp:121434568", ], + "alternate_labels": ["LEU858ARG"], "extensions": [ { "name": "civic_representative_coordinate", @@ -768,12 +745,12 @@ def schema_extra(schema: Dict[str, Any], "stop": 55259515, "reference_bases": "T", "variant_bases": "G", - "representative_transcript": "ENST00000275493.2", # noqa: E501 + "representative_transcript": "ENST00000275493.2", "ensembl_version": 75, "reference_build": "GRCh37", - "type": "coordinates" + "type": "coordinates", }, - "type": "Extension" + "type": "Extension", } ], "structural_type": "SO:0001583", @@ -781,33 +758,33 @@ def schema_extra(schema: Dict[str, Any], { "syntax": "hgvs.g", "value": "NC_000007.13:g.55259515T>G", - "type": "Expression" + "type": "Expression", }, { "syntax": "hgvs.p", "value": "NP_005219.2:p.Leu858Arg", - "type": "Expression" + "type": "Expression", }, { "syntax": "hgvs.c", "value": "NM_005228.4:c.2573T>G", - "type": "Expression" + "type": "Expression", }, { "syntax": "hgvs.c", "value": "ENST00000275493.2:c.2573T>G", - "type": "Expression" - } + "type": "Expression", + }, ], - "gene_context": "civic.gid:19" + "gene_context": "civic.gid:19", } ], "service_meta_": { "name": "metakb", "version": "1.1.0-alpha.4", "last_updated": "2021-12-16", - "url": "https://github.com/cancervariants/metakb" - } + "url": "https://github.com/cancervariants/metakb", + }, } @@ -824,29 +801,30 @@ class Config: """Configure examples.""" @staticmethod - def schema_extra(schema: Dict[str, Any], - model: Type['SearchStatementsService']) -> None: + def schema_extra( + schema: Dict[str, Any], model: Type["SearchStatementsService"] + ) -> None: """Configure OpenAPI schema""" - if 'title' in schema.keys(): - schema.pop('title', None) - for prop in schema.get('properties', {}).values(): - prop.pop('title', None) - schema['example'] = { + if "title" in schema.keys(): + schema.pop("title", None) + for prop in schema.get("properties", {}).values(): + prop.pop("title", None) + schema["example"] = { "query": { "variation": "EGFR L858R", "disease": "Lung Non-small Cell Carcinoma", "therapy": "Afatinib", - "statement_id": "civic.eid:2997" + "statement_id": "civic.eid:2997", }, "warnings": [], "matches": { "statements": ["civic.eid:2997"], - "propositions": ["proposition:109"] + "propositions": ["proposition:109"], }, "statements": [ { "id": "civic.eid:2997", - "description": "Afatinib, an irreversible inhibitor of the ErbB family of tyrosine kinases has been approved in the US for the first-line treatment of patients with metastatic non-small-cell lung cancer (NSCLC) who have tumours with EGFR exon 19 deletions or exon 21 (L858R) substitution mutations as detected by a US FDA-approved test", # noqa: E501 + "description": "Afatinib, an irreversible inhibitor of the ErbB family of tyrosine kinases has been approved in the US for the first-line treatment of patients with metastatic non-small-cell lung cancer (NSCLC) who have tumours with EGFR exon 19 deletions or exon 21 (L858R) substitution mutations as detected by a US FDA-approved test", "direction": "supports", "evidence_level": "civic.evidence_level:A", "variation_origin": "somatic", @@ -854,9 +832,9 @@ def schema_extra(schema: Dict[str, Any], "id": "proposition:133", "type": "therapeutic_response_proposition", "predicate": "predicts_sensitivity_to", - "subject": "ga4gh:VA.kgjrhgf84CEndyLjKdAO0RxN-e3pJjxA", # noqa: E501 + "subject": "ga4gh:VA.kgjrhgf84CEndyLjKdAO0RxN-e3pJjxA", "object_qualifier": "ncit:C2926", - "object": "rxcui:1430438" + "object": "rxcui:1430438", }, "variation_descriptor": { "id": "civic.vid:33", @@ -867,11 +845,9 @@ def schema_extra(schema: Dict[str, Any], "clinvar:16609", "clinvar:376282", "caid:CA126713", - "dbsnp:121434568" - ], - "alternate_labels": [ - "LEU858ARG" + "dbsnp:121434568", ], + "alternate_labels": ["LEU858ARG"], "extensions": [ { "type": "Extension", @@ -882,71 +858,61 @@ def schema_extra(schema: Dict[str, Any], "stop": 55259515, "reference_bases": "T", "variant_bases": "G", - "representative_transcript": - "ENST00000275493.2", + "representative_transcript": "ENST00000275493.2", "ensembl_version": 75, "reference_build": "GRCh37", - "type": "coordinates" - } + "type": "coordinates", + }, } ], - "variation_id": - "ga4gh:VA.kgjrhgf84CEndyLjKdAO0RxN-e3pJjxA", + "variation_id": "ga4gh:VA.kgjrhgf84CEndyLjKdAO0RxN-e3pJjxA", "variation": { - "_id": "ga4gh:VA.kgjrhgf84CEndyLjKdAO0RxN-e3pJjxA", # noqa: E501 + "_id": "ga4gh:VA.kgjrhgf84CEndyLjKdAO0RxN-e3pJjxA", "type": "Allele", "location": { - "_id": "ga4gh:VSL.Sfs_3PlVEYp9BxBsHsFfU1tvhfDq361f", # noqa: E501 + "_id": "ga4gh:VSL.Sfs_3PlVEYp9BxBsHsFfU1tvhfDq361f", "type": "SequenceLocation", - "sequence_id": "ga4gh:SQ.vyo55F6mA6n2LgN4cagcdRzOuh38V4mE", # noqa: E501 + "sequence_id": "ga4gh:SQ.vyo55F6mA6n2LgN4cagcdRzOuh38V4mE", "interval": { "type": "SequenceInterval", - "start": { - "type": "Number", - "value": 857 - }, - "end": { - "type": "Number", - "value": 858 - } - } + "start": {"type": "Number", "value": 857}, + "end": {"type": "Number", "value": 858}, + }, }, "state": { "type": "LiteralSequenceExpression", - "sequence": "R" - } + "sequence": "R", + }, }, "structural_type": "SO:0001583", "expressions": [ { "type": "Expression", "syntax": "hgvs.g", - "value": "NC_000007.13:g.55259515T>G" + "value": "NC_000007.13:g.55259515T>G", }, { "type": "Expression", "syntax": "hgvs.p", - "value": "NP_005219.2:p.Leu858Arg" + "value": "NP_005219.2:p.Leu858Arg", }, { "type": "Expression", "syntax": "hgvs.c", - "value": "NM_005228.4:c.2573T>G" + "value": "NM_005228.4:c.2573T>G", }, { "type": "Expression", "syntax": "hgvs.c", - "value": "ENST00000275493.2:c.2573T>G" - } + "value": "ENST00000275493.2:c.2573T>G", + }, ], "gene_context": { "id": "civic.gid:19", "type": "GeneDescriptor", "label": "EGFR", - "description": "EGFR is widely recognized for its importance in cancer. Amplification and mutations have been shown to be driving events in many cancer types. Its role in non-small cell lung cancer, glioblastoma and basal-like breast cancers has spurred many research and drug development efforts. Tyrosine kinase inhibitors have shown efficacy in EGFR amplfied tumors, most notably gefitinib and erlotinib. Mutations in EGFR have been shown to confer resistance to these drugs, particularly the variant T790M, which has been functionally characterized as a resistance marker for both of these drugs. The later generation TKI's have seen some success in treating these resistant cases, and targeted sequencing of the EGFR locus has become a common practice in treatment of non-small cell lung cancer. \nOverproduction of ligands is another possible mechanism of activation of EGFR. ERBB ligands include EGF, TGF-a, AREG, EPG, BTC, HB-EGF, EPR and NRG1-4 (for detailed information please refer to the respective ligand section).", # noqa: E501 - "xrefs": [ - "ncbigene:1956" - ], + "description": "EGFR is widely recognized for its importance in cancer. Amplification and mutations have been shown to be driving events in many cancer types. Its role in non-small cell lung cancer, glioblastoma and basal-like breast cancers has spurred many research and drug development efforts. Tyrosine kinase inhibitors have shown efficacy in EGFR amplfied tumors, most notably gefitinib and erlotinib. Mutations in EGFR have been shown to confer resistance to these drugs, particularly the variant T790M, which has been functionally characterized as a resistance marker for both of these drugs. The later generation TKI's have seen some success in treating these resistant cases, and targeted sequencing of the EGFR locus has become a common practice in treatment of non-small cell lung cancer. \nOverproduction of ligands is another possible mechanism of activation of EGFR. ERBB ligands include EGF, TGF-a, AREG, EPG, BTC, HB-EGF, EPR and NRG1-4 (for detailed information please refer to the respective ligand section).", + "xrefs": ["ncbigene:1956"], "alternate_labels": [ "ERRP", "EGFR", @@ -955,61 +921,53 @@ def schema_extra(schema: Dict[str, Any], "NISBD2", "HER1", "ERBB1", - "ERBB" + "ERBB", ], - "gene_id": "hgnc:3236" - } + "gene_id": "hgnc:3236", + }, }, "therapy_descriptor": { "id": "civic.tid:146", "type": "TherapyDescriptor", "label": "Afatinib", - "xrefs": [ - "ncit:C66940" - ], + "xrefs": ["ncit:C66940"], "alternate_labels": [ "BIBW2992", "BIBW 2992", - "(2e)-N-(4-(3-Chloro-4-Fluoroanilino)-7-(((3s)-Oxolan-3-yl)Oxy)Quinoxazolin-6-yl)-4-(Dimethylamino)But-2-Enamide" # noqa: E501 + "(2e)-N-(4-(3-Chloro-4-Fluoroanilino)-7-(((3s)-Oxolan-3-yl)Oxy)Quinoxazolin-6-yl)-4-(Dimethylamino)But-2-Enamide", ], - "therapy_id": "rxcui:1430438" + "therapy_id": "rxcui:1430438", }, "disease_descriptor": { "id": "civic.did:8", "type": "DiseaseDescriptor", "label": "Lung Non-small Cell Carcinoma", - "xrefs": [ - "DOID:3908" - ], - "disease_id": "ncit:C2926" + "xrefs": ["DOID:3908"], + "disease_id": "ncit:C2926", }, "method": { "id": "method:001", - "label": "Standard operating procedure for curation and clinical interpretation of variants in cancer", # noqa: E501 - "url": "https://genomemedicine.biomedcentral.com/articles/10.1186/s13073-019-0687-x", # noqa: E501 - "version": { - "year": 2019, - "month": 11, - "day": 29 - }, - "authors": "Danos, A.M., Krysiak, K., Barnell, E.K. et al.", # noqa: E501 - "type": "Method" + "label": "Standard operating procedure for curation and clinical interpretation of variants in cancer", + "url": "https://genomemedicine.biomedcentral.com/articles/10.1186/s13073-019-0687-x", + "version": {"year": 2019, "month": 11, "day": 29}, + "authors": "Danos, A.M., Krysiak, K., Barnell, E.K. et al.", + "type": "Method", }, "supported_by": [ { "id": "pmid:23982599", "label": "Dungo et al., 2013, Drugs", - "description": "Afatinib: first global approval.", # noqa: E501 - "type": "Document" + "description": "Afatinib: first global approval.", + "type": "Document", } ], - "type": "Statement" + "type": "Statement", } ], "service_meta_": { "name": "metakb", "version": "1.1.0-alpha.4", "last_updated": "2021-12-16", - "url": "https://github.com/cancervariants/metakb" - } + "url": "https://github.com/cancervariants/metakb", + }, } diff --git a/metakb/transform/__init__.py b/metakb/transform/__init__.py index 34ffa190..07041bf6 100644 --- a/metakb/transform/__init__.py +++ b/metakb/transform/__init__.py @@ -1,5 +1,7 @@ """Transformations for sources.""" -from .base import Transform # noqa: F401 -from .civic import CIViCTransform # noqa: F401 -from .moa import MOATransform # noqa: F401 -from .oncokb import OncoKBTransform # noqa: F401 +from .base import Transform +from .civic import CIViCTransform +from .moa import MOATransform +from .oncokb import OncoKBTransform + +__all__ = ["Transform", "CIViCTransform", "MOATransform", "OncoKBTransform"] diff --git a/metakb/transform/base.py b/metakb/transform/base.py index 947b13fe..e1f8d971 100644 --- a/metakb/transform/base.py +++ b/metakb/transform/base.py @@ -1,31 +1,41 @@ """A module for the Transform base class.""" -from typing import Dict, Optional, List import json -import canonicaljson import logging -from pathlib import Path from datetime import datetime as dt +from pathlib import Path +from typing import Dict, List, Optional +import canonicaljson from ga4gh.core import sha512t24u from metakb import APP_ROOT, DATE_FMT -from metakb.schemas import DiagnosticProposition, PrognosticProposition, \ - PropositionType, Predicate, DiagnosticPredicate, \ - PrognosticPredicate, PredictivePredicate, FunctionalPredicate, \ - PathogenicPredicate, TherapeuticResponseProposition from metakb.normalizers import VICCNormalizers - -logger = logging.getLogger('metakb') +from metakb.schemas import ( + DiagnosticPredicate, + DiagnosticProposition, + FunctionalPredicate, + PathogenicPredicate, + Predicate, + PredictivePredicate, + PrognosticPredicate, + PrognosticProposition, + PropositionType, + TherapeuticResponseProposition, +) + +logger = logging.getLogger("metakb") logger.setLevel(logging.DEBUG) class Transform: """A base class for transforming harvester data.""" - def __init__(self, - data_dir: Path = APP_ROOT / "data", - harvester_path: Optional[Path] = None, - normalizers: Optional[VICCNormalizers] = None) -> None: + def __init__( + self, + data_dir: Path = APP_ROOT / "data", + harvester_path: Optional[Path] = None, + normalizers: Optional[VICCNormalizers] = None, + ) -> None: """Initialize Transform base class. :param Path data_dir: Path to source data directory @@ -84,7 +94,7 @@ def extract_harvester(self) -> Dict[str, List]: PropositionType.DIAGNOSTIC: DiagnosticPredicate, PropositionType.PROGNOSTIC: PrognosticPredicate, PropositionType.PATHOGENIC: PathogenicPredicate, - PropositionType.FUNCTIONAL: FunctionalPredicate + PropositionType.FUNCTIONAL: FunctionalPredicate, } @staticmethod @@ -103,7 +113,7 @@ def _get_proposition_id( pred: Predicate, variation_ids: List[str] = [], disease_ids: List[str] = [], - therapy_ids: List[str] = [] + therapy_ids: List[str] = [], ) -> Optional[str]: """Retrieve stable ID for a proposition @@ -128,8 +138,12 @@ def _get_proposition_id( return f"proposition:{digest}" def _get_proposition( - self, proposition_type: PropositionType, predicate: Predicate, subject: str, - object_qualifier: str, object: Optional[str] = None + self, + proposition_type: PropositionType, + predicate: Predicate, + subject: str, + object_qualifier: str, + object: Optional[str] = None, ) -> Optional[Dict]: """Get proposition parameters. Updates the `propositions` instance variable if proposition params were successfully created. @@ -147,7 +161,7 @@ def _get_proposition( "type": proposition_type, "predicate": predicate, "subject": subject, - "object_qualifier": object_qualifier + "object_qualifier": object_qualifier, } if proposition_type == PropositionType.PREDICTIVE: @@ -157,14 +171,14 @@ def _get_proposition( params["predicate"], [params["subject"]], [params["object_qualifier"]], - [params["object"]] + [params["object"]], ) else: proposition_id = self._get_proposition_id( params["type"], params["predicate"], [params["subject"]], - [params["object_qualifier"]] + [params["object_qualifier"]], ) if proposition_id is None: return None @@ -175,7 +189,8 @@ def _get_proposition( proposition = PrognosticProposition(**params).dict(exclude_none=True) elif proposition_type == PropositionType.PREDICTIVE.value: proposition = TherapeuticResponseProposition(**params).dict( - exclude_none=True) + exclude_none=True + ) elif proposition_type == PropositionType.DIAGNOSTIC.value: proposition = DiagnosticProposition(**params).dict(exclude_none=True) else: @@ -198,8 +213,9 @@ def _get_document_id(**parameters) -> str: blob = json.dumps(params_sorted).encode("ascii") return f"document:{sha512t24u(blob=blob)}" - def create_json(self, transform_dir: Optional[Path] = None, - filename: Optional[str] = None) -> None: + def create_json( + self, transform_dir: Optional[Path] = None, filename: Optional[str] = None + ) -> None: """Create a composite JSON for transformed data. :param Optional[Path] transform_dir: Path to data directory for @@ -211,19 +227,19 @@ def create_json(self, transform_dir: Optional[Path] = None, transform_dir.mkdir(exist_ok=True, parents=True) composite_dict = { - 'statements': self.statements, - 'propositions': self.propositions, - 'variation_descriptors': self.variation_descriptors, - 'gene_descriptors': self.gene_descriptors, - 'therapy_descriptors': self.therapy_descriptors, - 'disease_descriptors': self.disease_descriptors, - 'methods': self.methods, - 'documents': self.documents + "statements": self.statements, + "propositions": self.propositions, + "variation_descriptors": self.variation_descriptors, + "gene_descriptors": self.gene_descriptors, + "therapy_descriptors": self.therapy_descriptors, + "disease_descriptors": self.disease_descriptors, + "methods": self.methods, + "documents": self.documents, } today = dt.strftime(dt.today(), DATE_FMT) if filename is None: filename = f"{self.name}_cdm_{today}.json" out = transform_dir / filename - with open(out, 'w+') as f: + with open(out, "w+") as f: json.dump(composite_dict, f, indent=4) diff --git a/metakb/transform/civic.py b/metakb/transform/civic.py index 30cc4337..7f15e772 100644 --- a/metakb/transform/civic.py +++ b/metakb/transform/civic.py @@ -1,17 +1,21 @@ """A module for to transform CIViC.""" -from typing import Optional, Dict, List, Set -from pathlib import Path import logging import re +from pathlib import Path +from typing import Dict, List, Optional, Set -from ga4gh.vrsatile.pydantic.vrsatile_models import VariationDescriptor, \ - Extension, Expression, GeneDescriptor, ValueObjectDescriptor +from ga4gh.vrsatile.pydantic.vrsatile_models import ( + Expression, + Extension, + GeneDescriptor, + ValueObjectDescriptor, + VariationDescriptor, +) +import metakb.schemas as schemas from metakb import APP_ROOT from metakb.normalizers import VICCNormalizers from metakb.transform.base import Transform -import metakb.schemas as schemas - logger = logging.getLogger(__name__) @@ -19,28 +23,30 @@ class CIViCTransform(Transform): """A class for transforming CIViC to the common data model.""" - def __init__(self, - data_dir: Path = APP_ROOT / "data", - harvester_path: Optional[Path] = None, - normalizers: Optional[VICCNormalizers] = None) -> None: + def __init__( + self, + data_dir: Path = APP_ROOT / "data", + harvester_path: Optional[Path] = None, + normalizers: Optional[VICCNormalizers] = None, + ) -> None: """Initialize CIViC Transform class. :param Path data_dir: Path to source data directory :param Optional[Path] harvester_path: Path to previously harvested data :param VICCNormalizers normalizers: normalizer collection instance """ - super().__init__(data_dir=data_dir, - harvester_path=harvester_path, - normalizers=normalizers) + super().__init__( + data_dir=data_dir, harvester_path=harvester_path, normalizers=normalizers + ) # Able to normalize these IDSs self.valid_ids = { - 'variation_descriptors': dict(), - 'disease_descriptors': dict(), - 'therapy_descriptors': dict() + "variation_descriptors": dict(), + "disease_descriptors": dict(), + "therapy_descriptors": dict(), } # Unable to normalize these IDs self.invalid_ids = { - 'therapy_descriptors': list(), - 'disease_descriptors': list() + "therapy_descriptors": list(), + "disease_descriptors": list(), } @staticmethod @@ -63,18 +69,20 @@ def _mp_to_variant_mapping(molecular_profiles: List[Dict]) -> Dict: else: mapping[mp_id] = mp_variant_ids[0] - logger.debug(f"{len(not_supported_mps)} Molecular Profiles not supported: " - f"{not_supported_mps}") + logger.debug( + f"{len(not_supported_mps)} Molecular Profiles not supported: " + f"{not_supported_mps}" + ) return mapping async def transform(self): """Transform CIViC harvested json to common data model.""" data = self.extract_harvester() - evidence_items = data['evidence'] - assertions = data['assertions'] + evidence_items = data["evidence"] + assertions = data["assertions"] molecular_profiles = data["molecular_profiles"] - variants = data['variants'] - genes = data['genes'] + variants = data["variants"] + genes = data["genes"] mp_id_to_v_id_mapping = self._mp_to_variant_mapping(molecular_profiles) # Only want evidence and assertions with approved status @@ -82,16 +90,28 @@ async def transform(self): assertions = [a for a in assertions if a["status"] == "accepted"] # Filter Variant IDs for Prognostic, Predictive, and Diagnostic evidence - supported_evidence_types = ['PROGNOSTIC', 'PREDICTIVE', 'DIAGNOSTIC'] - evidence_items = [e for e in evidence_items - if e["evidence_type"].upper() in supported_evidence_types] - assertions = [a for a in assertions - if a["assertion_type"].upper() in supported_evidence_types] + supported_evidence_types = ["PROGNOSTIC", "PREDICTIVE", "DIAGNOSTIC"] + evidence_items = [ + e + for e in evidence_items + if e["evidence_type"].upper() in supported_evidence_types + ] + assertions = [ + a + for a in assertions + if a["assertion_type"].upper() in supported_evidence_types + ] - vids = {mp_id_to_v_id_mapping[e["molecular_profile_id"]] - for e in evidence_items if e["molecular_profile_id"]} - vids |= {mp_id_to_v_id_mapping[a["molecular_profile_id"]] - for a in assertions if a["molecular_profile_id"]} + vids = { + mp_id_to_v_id_mapping[e["molecular_profile_id"]] + for e in evidence_items + if e["molecular_profile_id"] + } + vids |= { + mp_id_to_v_id_mapping[a["molecular_profile_id"]] + for a in assertions + if a["molecular_profile_id"] + } await self._add_variation_descriptors(variants, vids) self._add_gene_descriptors(genes) @@ -114,18 +134,18 @@ def _transform_evidence_and_assertions( `False` if records are assertions. """ for r in records: - name_lower = r['name'].lower() - if name_lower.startswith('eid'): - civic_id = name_lower.replace('eid', 'civic.eid:') + name_lower = r["name"].lower() + if name_lower.startswith("eid"): + civic_id = name_lower.replace("eid", "civic.eid:") else: - civic_id = name_lower.replace('aid', 'civic.aid:') + civic_id = name_lower.replace("aid", "civic.aid:") record_type = r["evidence_type"] if is_evidence else r["assertion_type"] if record_type not in ["PREDICTIVE", "PROGNOSTIC", "DIAGNOSTIC"]: continue else: # Functional Evidence types do not have a disease - if not r['disease']: + if not r["disease"]: continue if record_type == "PREDICTIVE": @@ -133,8 +153,7 @@ def _transform_evidence_and_assertions( continue else: therapy_id = f"civic.tid:{r['therapies'][0]['id']}" - therapy_descriptor = \ - self._add_therapy_descriptor(therapy_id, r) + therapy_descriptor = self._add_therapy_descriptor(therapy_id, r) if not therapy_descriptor: continue @@ -153,8 +172,9 @@ def _transform_evidence_and_assertions( self.disease_descriptors.append(disease_descriptor) variant_id = f"civic.vid:{mp_id_to_v_id_mapping[r['molecular_profile_id']]}" - variation_descriptor = \ - self.valid_ids['variation_descriptors'].get(variant_id) + variation_descriptor = self.valid_ids["variation_descriptors"].get( + variant_id + ) if not variation_descriptor: continue @@ -174,8 +194,12 @@ def _transform_evidence_and_assertions( object = therapy_descriptor["therapy_id"] proposition = self._get_proposition( - proposition_type, predicate, variation_descriptor["variation_id"], - disease_descriptor["disease_id"], object) + proposition_type, + predicate, + variation_descriptor["variation_id"], + disease_descriptor["disease_id"], + object, + ) # Only support Therapeutic Response and Prognostic if not proposition: @@ -183,23 +207,20 @@ def _transform_evidence_and_assertions( if is_evidence: # Evidence items's method and evidence level - method = f'method:{schemas.MethodID.CIVIC_EID_SOP}' + method = f"method:{schemas.MethodID.CIVIC_EID_SOP}" evidence_level = f"civic.evidence_level:{r['evidence_level']}" # Supported by evidence for evidence item - document = self._get_eid_document(r['source']) + document = self._get_eid_document(r["source"]) if document not in self.documents: self.documents.append(document) - supported_by = [document['id']] + supported_by = [document["id"]] else: # Assertion's method - if r['amp_level'] and not r['acmg_codes']: - method = \ - f'method:' \ - f'{schemas.MethodID.CIVIC_AID_AMP_ASCO_CAP}' - elif not r['amp_level'] and r['acmg_codes']: - method = f'method:' \ - f'{schemas.MethodID.CIVIC_AID_ACMG}' + if r["amp_level"] and not r["acmg_codes"]: + method = f"method:" f"{schemas.MethodID.CIVIC_AID_AMP_ASCO_CAP}" + elif not r["amp_level"] and r["acmg_codes"]: + method = f"method:" f"{schemas.MethodID.CIVIC_AID_ACMG}" else: # Statements are required to have a method logger.warning(f"Unable to get method for {civic_id}") @@ -210,31 +231,29 @@ def _transform_evidence_and_assertions( # Supported by evidence for assertion supported_by = list() - documents = \ - self._get_aid_document(r) + documents = self._get_aid_document(r) for d in documents: if d not in self.documents: self.documents.append(d) - supported_by.append(d['id']) + supported_by.append(d["id"]) for ev_id in r["evidence_ids"]: supported_by.append(f"civic.eid:{ev_id}") statement = schemas.Statement( id=civic_id, - description=r['description'], + description=r["description"], direction=self._get_evidence_direction( r["evidence_direction"] if is_evidence else r["assertion_direction"] ), evidence_level=evidence_level, - proposition=proposition['id'], - variation_origin=self._get_variation_origin( - r['variant_origin']), + proposition=proposition["id"], + variation_origin=self._get_variation_origin(r["variant_origin"]), variation_descriptor=variant_id, therapy_descriptor=therapy_id, disease_descriptor=disease_id, method=method, - supported_by=supported_by + supported_by=supported_by, ).dict(exclude_none=True) self.statements.append(statement) @@ -260,8 +279,8 @@ def _get_assertion_evidence_level(self, assertion) -> Optional[str]: """ evidence_level = None # TODO: CHECK - if assertion['amp_level']: - if assertion['amp_level'] == 'Not Applicable': + if assertion["amp_level"]: + if assertion["amp_level"] == "Not Applicable": evidence_level = None else: amp_level = assertion["amp_level"] @@ -272,24 +291,23 @@ def _get_assertion_evidence_level(self, assertion) -> Optional[str]: tier = match["tier"] level = match["level"] - if tier == 'I': + if tier == "I": tier = 1 - elif tier == 'II': + elif tier == "II": tier = 2 - elif tier == 'III': + elif tier == "III": tier = 3 - elif tier == 'IV': + elif tier == "IV": tier = 4 - evidence_level = f"amp_asco_cap_2017_level:" \ - f"{tier}{level}" + evidence_level = f"amp_asco_cap_2017_level:" f"{tier}{level}" else: raise Exception(f"{amp_level} not supported with regex") return evidence_level - def _get_proposition_type(self, - evidence_type, - is_evidence=True) -> Optional[schemas.PropositionType]: # noqa: E501 + def _get_proposition_type( + self, evidence_type, is_evidence=True + ) -> Optional[schemas.PropositionType]: """Return proposition type for a given EID or AID. :param str evidence_type: CIViC evidence type @@ -298,7 +316,7 @@ def _get_proposition_type(self, """ evidence_type = evidence_type.upper() if evidence_type in schemas.PropositionType.__members__.keys(): - if evidence_type == 'PREDISPOSING': + if evidence_type == "PREDISPOSING": if is_evidence: proposition_type = schemas.PropositionType.PREDISPOSING else: @@ -306,8 +324,10 @@ def _get_proposition_type(self, else: proposition_type = schemas.PropositionType[evidence_type] else: - raise KeyError(f"Proposition Type {evidence_type} not found in " - f"schemas.PropositionType") + raise KeyError( + f"Proposition Type {evidence_type} not found in " + f"schemas.PropositionType" + ) return proposition_type def _get_variation_origin(self, variant_origin) -> Optional[str]: @@ -327,32 +347,31 @@ def _get_variation_origin(self, variant_origin) -> Optional[str]: origin = None return origin - def _get_predicate(self, proposition_type, - clin_sig) -> Optional[schemas.Predicate]: + def _get_predicate(self, proposition_type, clin_sig) -> Optional[schemas.Predicate]: """Return predicate for an evidence item. :param str proposition_type: The proposition type :param str clin_sig: The evidence item's clinical significance :return: Predicate for proposition if valid """ - if clin_sig is None or clin_sig.upper() in ['N/A', 'UNKNOWN']: + if clin_sig is None or clin_sig.upper() in ["N/A", "UNKNOWN"]: return None - clin_sig = '_'.join(clin_sig.upper().split()) + clin_sig = "_".join(clin_sig.upper().split()) if clin_sig == "NA": logger.info("NA predicate not supported") return None predicate = None if proposition_type == schemas.PropositionType.PREDICTIVE: - if clin_sig == 'SENSITIVITYRESPONSE': + if clin_sig == "SENSITIVITYRESPONSE": predicate = schemas.PredictivePredicate.SENSITIVITY - elif clin_sig == 'RESISTANCE': + elif clin_sig == "RESISTANCE": predicate = schemas.PredictivePredicate.RESISTANCE elif proposition_type == schemas.PropositionType.DIAGNOSTIC: predicate = schemas.DiagnosticPredicate[clin_sig] elif proposition_type == schemas.PropositionType.PROGNOSTIC: - if clin_sig == 'POSITIVE': + if clin_sig == "POSITIVE": predicate = schemas.PrognosticPredicate.BETTER_OUTCOME else: predicate = schemas.PrognosticPredicate[clin_sig] @@ -363,11 +382,13 @@ def _get_predicate(self, proposition_type, # Look into why this is pass elif proposition_type == schemas.PropositionType.PATHOGENIC: - if clin_sig in ['PATHOGENIC', 'LIKELY_PATHOGENIC']: + if clin_sig in ["PATHOGENIC", "LIKELY_PATHOGENIC"]: predicate = schemas.PathogenicPredicate.PATHOGENIC else: - logger.warning(f"CIViC proposition type: {proposition_type} " - f"not supported in Predicate schemas") + logger.warning( + f"CIViC proposition type: {proposition_type} " + f"not supported in Predicate schemas" + ) return predicate async def _add_variation_descriptors(self, variants: List, vids: Set) -> None: @@ -383,8 +404,7 @@ async def _add_variation_descriptors(self, variants: List, vids: Set) -> None: if "c." in variant["name"]: variant_name = variant["name"] if "(" in variant_name: - variant_name = \ - variant_name.replace("(", "").replace(")", "") + variant_name = variant_name.replace("(", "").replace(")", "") variant_name = variant_name.split()[-1] else: variant_name = variant["name"] @@ -396,26 +416,57 @@ async def _add_variation_descriptors(self, variants: List, vids: Set) -> None: # Filtering to speed up transformation vname_lower = variant["name"].lower() - if vname_lower.endswith("fs") or "-" in vname_lower or "/" in vname_lower: # noqa: E501 + if vname_lower.endswith("fs") or "-" in vname_lower or "/" in vname_lower: if not hgvs_exprs: - logger.warning("Variation Normalizer does not support " - f"{variant_id}: {variant_query}") + logger.warning( + "Variation Normalizer does not support " + f"{variant_id}: {variant_query}" + ) continue unable_to_normalize = { - "mutation", "exon", "overexpression", - "frameshift", "promoter", "deletion", "type", "insertion", - "expression", "duplication", "copy", "underexpression", - "number", "variation", "repeat", "rearrangement", "activation", - "expression", "mislocalization", "translocation", "wild", - "polymorphism", "frame", "shift", "loss", "function", "levels", - "inactivation", "snp", "fusion", "dup", "truncation", - "homozygosity", "gain", "phosphorylation" + "mutation", + "exon", + "overexpression", + "frameshift", + "promoter", + "deletion", + "type", + "insertion", + "expression", + "duplication", + "copy", + "underexpression", + "number", + "variation", + "repeat", + "rearrangement", + "activation", + "expression", + "mislocalization", + "translocation", + "wild", + "polymorphism", + "frame", + "shift", + "loss", + "function", + "levels", + "inactivation", + "snp", + "fusion", + "dup", + "truncation", + "homozygosity", + "gain", + "phosphorylation", } if set(vname_lower.split()) & unable_to_normalize: - logger.warning("Variation Normalizer does not support " - f"{variant_id}: {variant_query}") + logger.warning( + "Variation Normalizer does not support " + f"{variant_id}: {variant_query}" + ) continue variation_descriptor = await self.vicc_normalizers.normalize_variation( @@ -424,9 +475,11 @@ async def _add_variation_descriptors(self, variants: List, vids: Set) -> None: # Couldn't find normalized concept if not variation_descriptor: - logger.warning("Variation Normalizer unable to normalize " - f"civic.vid:{variant['id']} using query " - f"{variant_query}") + logger.warning( + "Variation Normalizer unable to normalize " + f"civic.vid:{variant['id']} using query " + f"{variant_query}" + ) continue if variant["variant_types"]: @@ -443,16 +496,15 @@ async def _add_variation_descriptors(self, variants: List, vids: Set) -> None: structural_type=structural_type, expressions=hgvs_exprs, xrefs=self._get_variant_xrefs(variant), - alternate_labels=[v_alias for v_alias in - variant["variant_aliases"] if not - v_alias.startswith("RS")], - extensions=self._get_variant_extensions(variant) + alternate_labels=[ + v_alias + for v_alias in variant["variant_aliases"] + if not v_alias.startswith("RS") + ], + extensions=self._get_variant_extensions(variant), ).dict(by_alias=True, exclude_none=True) - self.valid_ids["variation_descriptors"][variant_id] = \ - variation_descriptor - self.variation_descriptors.append( - variation_descriptor - ) + self.valid_ids["variation_descriptors"][variant_id] = variation_descriptor + self.variation_descriptors.append(variation_descriptor) def _get_variant_extensions(self, variant) -> list: """Return a list of extensions for a variant. @@ -462,9 +514,10 @@ def _get_variant_extensions(self, variant) -> list: """ return [ Extension( - name='civic_representative_coordinate', - value={k: v for k, v in variant['coordinates'].items() - if v is not None} + name="civic_representative_coordinate", + value={ + k: v for k, v in variant["coordinates"].items() if v is not None + }, ).dict(exclude_none=True) ] @@ -475,23 +528,26 @@ def _get_variant_xrefs(self, v) -> Optional[List[str]]: :return: A dictionary of xrefs """ xrefs = [] - for xref in ['clinvar_entries', 'allele_registry_id', - 'variant_aliases']: - if xref == 'clinvar_entries': - for clinvar_entry in v['clinvar_entries']: - if clinvar_entry and clinvar_entry not in ['N/A', - "NONE FOUND"]: - xrefs.append(f"{schemas.XrefSystem.CLINVAR.value}:" - f"{clinvar_entry}") - elif xref == 'allele_registry_id' and v['allele_registry_id']: - xrefs.append(f"{schemas.XrefSystem.CLINGEN.value}:" - f"{v['allele_registry_id']}") - elif xref == 'variant_aliases': - dbsnp_xrefs = [item for item in v['variant_aliases'] - if item.startswith('RS')] + for xref in ["clinvar_entries", "allele_registry_id", "variant_aliases"]: + if xref == "clinvar_entries": + for clinvar_entry in v["clinvar_entries"]: + if clinvar_entry and clinvar_entry not in ["N/A", "NONE FOUND"]: + xrefs.append( + f"{schemas.XrefSystem.CLINVAR.value}:" f"{clinvar_entry}" + ) + elif xref == "allele_registry_id" and v["allele_registry_id"]: + xrefs.append( + f"{schemas.XrefSystem.CLINGEN.value}:" f"{v['allele_registry_id']}" + ) + elif xref == "variant_aliases": + dbsnp_xrefs = [ + item for item in v["variant_aliases"] if item.startswith("RS") + ] for dbsnp_xref in dbsnp_xrefs: - xrefs.append(f"{schemas.XrefSystem.DB_SNP.value}:" - f"{dbsnp_xref.split('RS')[-1]}") + xrefs.append( + f"{schemas.XrefSystem.DB_SNP.value}:" + f"{dbsnp_xref.split('RS')[-1]}" + ) return xrefs def _get_hgvs_expr(self, variant) -> Optional[List[Dict[str, str]]]: @@ -501,17 +557,16 @@ def _get_hgvs_expr(self, variant) -> Optional[List[Dict[str, str]]]: :return: A list of hgvs expressions """ hgvs_expressions = list() - for hgvs_expr in variant['hgvs_expressions']: - if ':g.' in hgvs_expr: - syntax = 'hgvs.g' - elif ':c.' in hgvs_expr: - syntax = 'hgvs.c' + for hgvs_expr in variant["hgvs_expressions"]: + if ":g." in hgvs_expr: + syntax = "hgvs.g" + elif ":c." in hgvs_expr: + syntax = "hgvs.c" else: - syntax = 'hgvs.p' - if hgvs_expr != 'N/A': + syntax = "hgvs.p" + if hgvs_expr != "N/A": hgvs_expressions.append( - Expression(syntax=syntax, - value=hgvs_expr).dict(exclude_none=True) + Expression(syntax=syntax, value=hgvs_expr).dict(exclude_none=True) ) return hgvs_expressions @@ -523,51 +578,51 @@ def _add_gene_descriptors(self, genes) -> None: for gene in genes: gene_id = f"civic.gid:{gene['id']}" ncbigene = f"ncbigene:{gene['entrez_id']}" - queries = [ncbigene, gene['name']] + gene['aliases'] + queries = [ncbigene, gene["name"]] + gene["aliases"] - _, normalized_gene_id = \ - self.vicc_normalizers.normalize_gene(queries) + _, normalized_gene_id = self.vicc_normalizers.normalize_gene(queries) if normalized_gene_id: gene_descriptor = GeneDescriptor( id=gene_id, - label=gene['name'], - description=gene['description'] if gene['description'] else None, # noqa: E501 + label=gene["name"], + description=gene["description"] if gene["description"] else None, gene_id=normalized_gene_id, - alternate_labels=gene['aliases'], - xrefs=[ncbigene] + alternate_labels=gene["aliases"], + xrefs=[ncbigene], ).dict(exclude_none=True) self.gene_descriptors.append(gene_descriptor) else: - logger.warning(f"Gene Normalizer unable to normalize {gene_id}" - f"using queries: {queries}") + logger.warning( + f"Gene Normalizer unable to normalize {gene_id}" + f"using queries: {queries}" + ) - def _add_disease_descriptor(self, disease_id, record) \ - -> Optional[ValueObjectDescriptor]: + def _add_disease_descriptor( + self, disease_id, record + ) -> Optional[ValueObjectDescriptor]: """Add disease ID to list of valid or invalid transformations. :param str disease_id: The CIViC ID for the disease :param dict record: CIViC AID or EID :return: A disease descriptor """ - disease_descriptor = \ - self.valid_ids['disease_descriptors'].get(disease_id) + disease_descriptor = self.valid_ids["disease_descriptors"].get(disease_id) if disease_descriptor: return disease_descriptor else: disease_descriptor = None - if disease_id not in self.invalid_ids['disease_descriptors']: - disease_descriptor = \ - self._get_disease_descriptors(record['disease']) + if disease_id not in self.invalid_ids["disease_descriptors"]: + disease_descriptor = self._get_disease_descriptors(record["disease"]) if disease_descriptor: - self.valid_ids['disease_descriptors'][disease_id] = \ - disease_descriptor + self.valid_ids["disease_descriptors"][ + disease_id + ] = disease_descriptor else: - self.invalid_ids['disease_descriptors'].append(disease_id) + self.invalid_ids["disease_descriptors"].append(disease_id) return disease_descriptor - def _get_disease_descriptors(self, disease) \ - -> Optional[ValueObjectDescriptor]: + def _get_disease_descriptors(self, disease) -> Optional[ValueObjectDescriptor]: """Get a disease descriptor. :param dict disease: A CIViC disease record @@ -577,8 +632,8 @@ def _get_disease_descriptors(self, disease) \ return None disease_id = f"civic.did:{disease['id']}" - display_name = disease['display_name'] - doid = disease['doid'] + display_name = disease["display_name"] + doid = disease["doid"] if not doid: logger.debug(f"{disease_id} ({display_name}) has null DOID") @@ -589,12 +644,13 @@ def _get_disease_descriptors(self, disease) \ queries = [doid, display_name] xrefs = [doid] - _, normalized_disease_id = \ - self.vicc_normalizers.normalize_disease(queries) + _, normalized_disease_id = self.vicc_normalizers.normalize_disease(queries) if not normalized_disease_id: - logger.warning(f"Disease Normalizer unable to normalize: " - f"{disease_id} using queries {queries}") + logger.warning( + f"Disease Normalizer unable to normalize: " + f"{disease_id} using queries {queries}" + ) return None disease_descriptor = ValueObjectDescriptor( @@ -602,36 +658,37 @@ def _get_disease_descriptors(self, disease) \ type="DiseaseDescriptor", label=display_name, disease_id=normalized_disease_id, - xrefs=xrefs if xrefs else None + xrefs=xrefs if xrefs else None, ).dict(exclude_none=True) return disease_descriptor - def _add_therapy_descriptor(self, therapy_id, record)\ - -> Optional[ValueObjectDescriptor]: + def _add_therapy_descriptor( + self, therapy_id, record + ) -> Optional[ValueObjectDescriptor]: """Add therapy ID to list of valid or invalid transformations. :param str therapy_id: The CIViC ID for the drug :param dict record: CIViC AID or EID :return: A therapy descriptor """ - therapy_descriptor = \ - self.valid_ids['therapy_descriptors'].get(therapy_id) + therapy_descriptor = self.valid_ids["therapy_descriptors"].get(therapy_id) if therapy_descriptor: return therapy_descriptor else: therapy_descriptor = None - if therapy_id not in self.invalid_ids['therapy_descriptors']: - therapy_descriptor = \ - self._get_therapy_descriptor(record["therapies"][0]) + if therapy_id not in self.invalid_ids["therapy_descriptors"]: + therapy_descriptor = self._get_therapy_descriptor( + record["therapies"][0] + ) if therapy_descriptor: - self.valid_ids['therapy_descriptors'][therapy_id] = \ - therapy_descriptor + self.valid_ids["therapy_descriptors"][ + therapy_id + ] = therapy_descriptor else: - self.invalid_ids['therapy_descriptors'].append(therapy_id) + self.invalid_ids["therapy_descriptors"].append(therapy_id) return therapy_descriptor - def _get_therapy_descriptor(self, drug) \ - -> Optional[ValueObjectDescriptor]: + def _get_therapy_descriptor(self, drug) -> Optional[ValueObjectDescriptor]: """Get a therapy descriptor. :param dict drug: A CIViC drug record @@ -646,25 +703,32 @@ def _get_therapy_descriptor(self, drug) \ else: queries = [label] - therapy_norm_resp, normalized_therapy_id = \ - self.vicc_normalizers.normalize_therapy(queries) + ( + therapy_norm_resp, + normalized_therapy_id, + ) = self.vicc_normalizers.normalize_therapy(queries) if not normalized_therapy_id: - logger.warning(f"Therapy Normalizer unable to normalize: " - f"using queries {ncit_id} and {label}") + logger.warning( + f"Therapy Normalizer unable to normalize: " + f"using queries {ncit_id} and {label}" + ) return None - regulatory_approval_extension = \ + regulatory_approval_extension = ( self.vicc_normalizers.get_regulatory_approval_extension(therapy_norm_resp) + ) therapy_descriptor = ValueObjectDescriptor( id=therapy_id, type="TherapyDescriptor", label=label, therapy_id=normalized_therapy_id, - alternate_labels=drug['aliases'], + alternate_labels=drug["aliases"], xrefs=[ncit_id] if ncit_id else None, - extensions=[regulatory_approval_extension] if regulatory_approval_extension else None # noqa: E501 + extensions=[regulatory_approval_extension] + if regulatory_approval_extension + else None, ).dict(exclude_none=True) return therapy_descriptor @@ -672,44 +736,39 @@ def _add_methods(self) -> None: """Add methods to list of transformations.""" self.methods = [ schemas.Method( - id=f'method:' - f'{schemas.MethodID.CIVIC_EID_SOP}', - label='Standard operating procedure for curation and clinical' - ' interpretation of variants in cancer', - url='https://genomemedicine.biomedcentral.com/articles/' - '10.1186/s13073-019-0687-x', + id=f"method:" f"{schemas.MethodID.CIVIC_EID_SOP}", + label="Standard operating procedure for curation and clinical" + " interpretation of variants in cancer", + url="https://genomemedicine.biomedcentral.com/articles/" + "10.1186/s13073-019-0687-x", version=schemas.Date(year=2019, month=11, day=29).dict(), - authors='Danos, A.M., Krysiak, K., Barnell, E.K. et al.' + authors="Danos, A.M., Krysiak, K., Barnell, E.K. et al.", ).dict(exclude_none=True), schemas.Method( - id=f'method:' - f'{schemas.MethodID.CIVIC_AID_AMP_ASCO_CAP.value}', - label='Standards and Guidelines for the ' - 'Interpretation and Reporting of Sequence ' - 'Variants in Cancer: A Joint Consensus ' - 'Recommendation of the Association ' - 'for Molecular Pathology, American Society of ' - 'Clinical Oncology, and College of American ' - 'Pathologists', - url='https://pubmed.ncbi.nlm.nih.gov/27993330/', - version=schemas.Date(year=2017, - month=1).dict(exclude_none=True), - authors='Li MM, Datto M, Duncavage EJ, et al.' + id=f"method:" f"{schemas.MethodID.CIVIC_AID_AMP_ASCO_CAP.value}", + label="Standards and Guidelines for the " + "Interpretation and Reporting of Sequence " + "Variants in Cancer: A Joint Consensus " + "Recommendation of the Association " + "for Molecular Pathology, American Society of " + "Clinical Oncology, and College of American " + "Pathologists", + url="https://pubmed.ncbi.nlm.nih.gov/27993330/", + version=schemas.Date(year=2017, month=1).dict(exclude_none=True), + authors="Li MM, Datto M, Duncavage EJ, et al.", ).dict(exclude_none=True), schemas.Method( - id=f'method:' - f'{schemas.MethodID.CIVIC_AID_ACMG.value}', - label='Standards and guidelines for the ' - 'interpretation of sequence variants: a ' - 'joint consensus recommendation of the ' - 'American College of Medical Genetics and' - ' Genomics and the Association for ' - 'Molecular Pathology', - url='https://pubmed.ncbi.nlm.nih.gov/25741868/', - version=schemas.Date(year=2015, - month=5).dict(exclude_none=True), - authors='Richards S, Aziz N, Bale S, et al.' - ).dict(exclude_none=True) + id=f"method:" f"{schemas.MethodID.CIVIC_AID_ACMG.value}", + label="Standards and guidelines for the " + "interpretation of sequence variants: a " + "joint consensus recommendation of the " + "American College of Medical Genetics and" + " Genomics and the Association for " + "Molecular Pathology", + url="https://pubmed.ncbi.nlm.nih.gov/25741868/", + version=schemas.Date(year=2015, month=5).dict(exclude_none=True), + authors="Richards S, Aziz N, Bale S, et al.", + ).dict(exclude_none=True), ] def _get_eid_document(self, source) -> Optional[schemas.Document]: @@ -718,21 +777,21 @@ def _get_eid_document(self, source) -> Optional[schemas.Document]: :param dict source: An evidence item's source :return: Document for EID """ - source_type = source['source_type'].upper() + source_type = source["source_type"].upper() if source_type in schemas.SourcePrefix.__members__: prefix = schemas.SourcePrefix[source_type].value document_id = f"{prefix}:{source['citation_id']}" xrefs = [] - if source['asco_abstract_id']: + if source["asco_abstract_id"]: xrefs.append(f"asco.abstract:{source['asco_abstract_id']}") - if source['pmc_id']: + if source["pmc_id"]: xrefs.append(f"pmc:{source['pmc_id']}") document = schemas.Document( id=document_id, - label=source['citation'], + label=source["citation"], description=source["title"], - xrefs=xrefs if xrefs else None + xrefs=xrefs if xrefs else None, ).dict(exclude_none=True) return document else: @@ -750,25 +809,27 @@ def _get_aid_document(self, assertion: Dict) -> List[schemas.Document]: label = nccn_guideline.get("name") version = assertion["nccn_guideline_version"] if label and version: - doc_id = "https://www.nccn.org/professionals/physician_gls/default.aspx" # noqa: E501 + doc_id = "https://www.nccn.org/professionals/physician_gls/default.aspx" doc_label = f"NCCN Guidelines: {label} version {version}" db_id = self._get_document_id(document_id=doc_id, label=doc_label) documents = list() - documents.append(schemas.Document( - id=db_id, - document_id=doc_id, - label=doc_label - ).dict(exclude_none=True)) + documents.append( + schemas.Document(id=db_id, document_id=doc_id, label=doc_label).dict( + exclude_none=True + ) + ) # TODO: Check this after first pass # ACMG Codes - if assertion['acmg_codes']: - for acmg_code in assertion['acmg_codes']: + if assertion["acmg_codes"]: + for acmg_code in assertion["acmg_codes"]: document_id = f"acmg:{acmg_code['code']}" - documents.append(schemas.Document( - id=document_id, - label=acmg_code['code'], - description=acmg_code['description'] - ).dict(exclude_none=True)) + documents.append( + schemas.Document( + id=document_id, + label=acmg_code["code"], + description=acmg_code["description"], + ).dict(exclude_none=True) + ) return documents diff --git a/metakb/transform/moa.py b/metakb/transform/moa.py index bee74e5c..ceba0f36 100644 --- a/metakb/transform/moa.py +++ b/metakb/transform/moa.py @@ -1,38 +1,40 @@ """A module to convert MOA resources to common data model""" -from typing import Optional import logging +from typing import Optional from urllib.parse import quote -from ga4gh.vrsatile.pydantic.vrsatile_models import VariationDescriptor,\ - Extension, GeneDescriptor, ValueObjectDescriptor +from ga4gh.vrsatile.pydantic.vrsatile_models import ( + Extension, + GeneDescriptor, + ValueObjectDescriptor, + VariationDescriptor, +) import metakb.schemas as schemas from metakb.transform.base import Transform -logger = logging.getLogger('metakb.transform.moa') +logger = logging.getLogger("metakb.transform.moa") logger.setLevel(logging.DEBUG) class MOATransform(Transform): """A class for transforming MOA resources to common data model.""" - async def transform(self): + async def transform(self) -> None: """Transform MOA harvested JSON to common date model. Saves output in MOA transform directory. """ data = self.extract_harvester() cdm_assertions = {} # assertions that have been transformed to CDM - assertions = data['assertions'] - sources = data['sources'] - variants = data['variants'] + assertions = data["assertions"] + sources = data["sources"] + variants = data["variants"] # Transform MOA assertions - await self._transform_statements(assertions, variants, sources, - cdm_assertions) + await self._transform_statements(assertions, variants, sources, cdm_assertions) - async def _transform_statements(self, records, variants, sources, - cdm_assertions): + async def _transform_statements(self, records, variants, sources, cdm_assertions): """Add transformed assertions to the response list. :param: A list of MOA assertion records @@ -44,32 +46,42 @@ async def _transform_statements(self, records, variants, sources, """ for record in records: gene_descriptors = self._get_gene_descriptors( - self._get_record(record['variant']['id'], variants)) - descriptors = \ - await self._get_descriptors(record, variants, gene_descriptors) + self._get_record(record["variant"]["id"], variants) + ) + descriptors = await self._get_descriptors( + record, variants, gene_descriptors + ) if not descriptors: continue else: - therapy_descriptors, variation_descriptors, disease_descriptors = descriptors # noqa: E501 - - propositions = \ - self._get_tr_propositions(record, variation_descriptors, - disease_descriptors, - therapy_descriptors) + ( + therapy_descriptors, + variation_descriptors, + disease_descriptors, + ) = descriptors + + propositions = self._get_tr_propositions( + record, variation_descriptors, disease_descriptors, therapy_descriptors + ) # We only want therapeutic response for now if not propositions: continue documents = self._get_documents( - self._get_record(record['source_ids'], sources)) + self._get_record(record["source_ids"], sources) + ) methods = self._get_method() - statements = self._get_statement(record, propositions, - variation_descriptors, - therapy_descriptors, - disease_descriptors, - methods, documents) + statements = self._get_statement( + record, + propositions, + variation_descriptors, + therapy_descriptors, + disease_descriptors, + methods, + documents, + ) response = schemas.Response( statements=statements, @@ -79,15 +91,21 @@ async def _transform_statements(self, records, variants, sources, therapy_descriptors=therapy_descriptors, disease_descriptors=disease_descriptors, methods=methods, - documents=documents + documents=documents, ).dict(by_alias=True, exclude_none=True) cdm_assertions[f"moa:assertion_{record['id']}"] = response - for field in ['statements', 'propositions', - 'variation_descriptors', 'gene_descriptors', - 'therapy_descriptors', 'disease_descriptors', - 'methods', 'documents']: + for field in [ + "statements", + "propositions", + "variation_descriptors", + "gene_descriptors", + "therapy_descriptors", + "disease_descriptors", + "methods", + "documents", + ]: attr = getattr(self, field) var = response[field] for el in var: @@ -105,31 +123,44 @@ async def _get_descriptors(self, record, variants, gene_descriptors): therapy_descriptors = self._get_therapy_descriptors(record) len_td = len(therapy_descriptors) if len_td != 1: - logger.warning(f"Expected 1 therapy_descriptor for" - f" {record['therapy_name']} but found {len_td}") + logger.warning( + f"Expected 1 therapy_descriptor for" + f" {record['therapy_name']} but found {len_td}" + ) return None variation_descriptors = await self._get_variation_descriptors( - self._get_record(record['variant']['id'], variants), - gene_descriptors) + self._get_record(record["variant"]["id"], variants), gene_descriptors + ) len_vd = len(variation_descriptors) if len_vd != 1: - logger.warning(f"Expected 1 variation descriptor for" - f" {record['variant']} but found {len_vd}") + logger.warning( + f"Expected 1 variation descriptor for" + f" {record['variant']} but found {len_vd}" + ) return None disease_descriptors = self._get_disease_descriptors(record) len_dd = len(disease_descriptors) if len_dd != 1: - logger.warning(f"Expected 1 disease descriptor for" - f" {record['disease']} but found {len_dd}") + logger.warning( + f"Expected 1 disease descriptor for" + f" {record['disease']} but found {len_dd}" + ) return None return therapy_descriptors, variation_descriptors, disease_descriptors - def _get_statement(self, record, propositions, variant_descriptors, - therapy_descriptors, disease_descriptors, - methods, documents): + def _get_statement( + self, + record, + propositions, + variant_descriptors, + therapy_descriptors, + disease_descriptors, + methods, + documents, + ): """Get a statement for an assertion. :param dict record: A MOA assertion record @@ -141,26 +172,26 @@ def _get_statement(self, record, propositions, variant_descriptors, :param list documents: Supporting evidence for the rcord :return: A list of statement """ - evidence_level = record['predictive_implication'].strip().replace(' ', '_') # noqa: E501 + evidence_level = record["predictive_implication"].strip().replace(" ", "_") statement = schemas.Statement( id=f"{schemas.SourceName.MOA.value}.assertion:{record['id']}", - description=record['description'], - evidence_level=f"moa.evidence_level:" - f"{evidence_level}", - proposition=propositions[0]['id'], - variation_origin=self._get_variation_origin(record['variant']), - variation_descriptor=variant_descriptors[0]['id'], - therapy_descriptor=therapy_descriptors[0]['id'], - disease_descriptor=disease_descriptors[0]['id'], - method=methods[0]['id'], - supported_by=[se['id'] for se in documents] + description=record["description"], + evidence_level=f"moa.evidence_level:" f"{evidence_level}", + proposition=propositions[0]["id"], + variation_origin=self._get_variation_origin(record["variant"]), + variation_descriptor=variant_descriptors[0]["id"], + therapy_descriptor=therapy_descriptors[0]["id"], + disease_descriptor=disease_descriptors[0]["id"], + method=methods[0]["id"], + supported_by=[se["id"] for se in documents], ).dict(exclude_none=True) return [statement] - def _get_tr_propositions(self, record, variation_descriptors, - disease_descriptors, therapy_descriptors): + def _get_tr_propositions( + self, record, variation_descriptors, disease_descriptors, therapy_descriptors + ): """Return a list of propositions. :param: MOA assertion @@ -169,19 +200,19 @@ def _get_tr_propositions(self, record, variation_descriptors, :param: A list of therapy_descriptors :return: A list of therapeutic propositions. """ - predicate = self._get_predicate(record['clinical_significance']) + predicate = self._get_predicate(record["clinical_significance"]) # Don't support TR that has `None`, 'N/A', or 'Unknown' predicate if not predicate: return [] params = { - 'id': '', - 'type': schemas.PropositionType.PREDICTIVE, - 'predicate': predicate, - 'subject': variation_descriptors[0]['variation_id'], - 'object_qualifier': disease_descriptors[0]['disease_id'], - 'object': therapy_descriptors[0]['therapy_id'] + "id": "", + "type": schemas.PropositionType.PREDICTIVE, + "predicate": predicate, + "subject": variation_descriptors[0]["variation_id"], + "object_qualifier": disease_descriptors[0]["disease_id"], + "object": therapy_descriptors[0]["therapy_id"], } # Get corresponding id for proposition @@ -190,14 +221,14 @@ def _get_tr_propositions(self, record, variation_descriptors, params["predicate"], variation_ids=[params["subject"]], disease_ids=[params["object_qualifier"]], - therapy_ids=[params["object"]] + therapy_ids=[params["object"]], + ) + proposition = schemas.TherapeuticResponseProposition(**params).dict( + exclude_none=True ) - proposition = schemas.TherapeuticResponseProposition( - **params).dict(exclude_none=True) return [proposition] - def _get_predicate(self, - clin_sig) -> Optional[schemas.PredictivePredicate]: + def _get_predicate(self, clin_sig) -> Optional[schemas.PredictivePredicate]: """Get the predicate of this record :param: clinical significance of the assertion @@ -216,9 +247,9 @@ def _get_variation_origin(self, variant): :param: A MOA variant record :return: A str representation of variation origin """ - if variant['feature_type'] == 'somatic_variant': + if variant["feature_type"] == "somatic_variant": origin = schemas.VariationOrigin.SOMATIC.value - elif variant['feature_type'] == 'germline_variant': + elif variant["feature_type"] == "germline_variant": origin = schemas.VariationOrigin.GERMLINE.value else: origin = None @@ -231,36 +262,44 @@ async def _get_variation_descriptors(self, variant, g_descriptors): :param: single assertion record from MOA :return: list of variation descriptor """ - vrs_ref_allele_seq = variant['protein_change'][2] \ - if 'protein_change' in variant and variant['protein_change'] else None # noqa: E501 + vrs_ref_allele_seq = ( + variant["protein_change"][2] + if "protein_change" in variant and variant["protein_change"] + else None + ) variation_descriptor = None # For now, the normalizer only support a.a substitution - if g_descriptors and 'protein_change' in variant and variant['protein_change']: # noqa: E501 - gene = g_descriptors[0]['label'] + if g_descriptors and "protein_change" in variant and variant["protein_change"]: + gene = g_descriptors[0]["label"] query = f"{gene} {variant['protein_change'][2:]}" - variation_descriptor = \ - await self.vicc_normalizers.normalize_variation([query]) + variation_descriptor = await self.vicc_normalizers.normalize_variation( + [query] + ) if not variation_descriptor: - logger.warning(f"Variant Normalizer unable to normalize: " - f"moa.variant:{variant['id']}.") + logger.warning( + f"Variant Normalizer unable to normalize: " + f"moa.variant:{variant['id']}." + ) return [] else: - logger.warning(f"Variation Normalizer does not support " - f"moa.variant:{variant['id']}: {variant}") + logger.warning( + f"Variation Normalizer does not support " + f"moa.variant:{variant['id']}: {variant}" + ) return [] - gene_context = g_descriptors[0]['id'] if g_descriptors else None + gene_context = g_descriptors[0]["id"] if g_descriptors else None variation_descriptor = VariationDescriptor( id=f"moa.variant:{variant['id']}", - label=variant['feature'], + label=variant["feature"], variation_id=variation_descriptor.variation_id, variation=variation_descriptor.variation, gene_context=gene_context, vrs_ref_allele_seq=vrs_ref_allele_seq, - extensions=self._get_variant_extensions(variant) + extensions=self._get_variant_extensions(variant), ).dict(by_alias=True, exclude_none=True) return [variation_descriptor] @@ -270,23 +309,29 @@ def _get_variant_extensions(self, variant): :param dict variant: A MOA variant record :return: A list of extensions """ - coordinate = ['chromosome', 'start_position', 'end_position', - 'reference_allele', 'alternate_allele', - 'cdna_change', 'protein_change', 'exon'] + coordinate = [ + "chromosome", + "start_position", + "end_position", + "reference_allele", + "alternate_allele", + "cdna_change", + "protein_change", + "exon", + ] extensions = [ Extension( - name='moa_representative_coordinate', - value={c: variant[c] for c in coordinate} + name="moa_representative_coordinate", + value={c: variant[c] for c in coordinate}, ).dict(exclude_none=True) ] - if variant['rsid']: + if variant["rsid"]: extensions.append( - Extension( - name='moa_rsid', - value=variant['rsid'] - ).dict(exclude_none=True) + Extension(name="moa_rsid", value=variant["rsid"]).dict( + exclude_none=True + ) ) return extensions @@ -296,25 +341,22 @@ def _get_gene_descriptors(self, variant): :param: A MOA variant record :return: A Gene Descriptor """ - genes = [value for key, value in variant.items() - if key.startswith('gene')] + genes = [value for key, value in variant.items() if key.startswith("gene")] genes = list(filter(None, genes)) - gene_descriptors = [] # for fusion protein, we would include both genes # noqa: E501 + gene_descriptors = [] # for fusion protein, we would include both genes if genes: for gene in genes: - _, normalized_gene_id = \ - self.vicc_normalizers.normalize_gene([gene]) + _, normalized_gene_id = self.vicc_normalizers.normalize_gene([gene]) if normalized_gene_id: gene_descriptor = GeneDescriptor( id=f"{schemas.SourceName.MOA.value}.normalize." - f"{schemas.NormalizerPrefix.GENE.value}:{quote(gene)}", # noqa: E501 + f"{schemas.NormalizerPrefix.GENE.value}:{quote(gene)}", label=gene, gene_id=normalized_gene_id, ).dict(exclude_none=True) else: - logger.warning(f"Gene Normalizer unable to " - f"normalize: {gene}") + logger.warning(f"Gene Normalizer unable to " f"normalize: {gene}") gene_descriptor = {} gene_descriptors.append(gene_descriptor) @@ -327,21 +369,19 @@ def _get_documents(self, source): :param: An evidence source :param: Keeps track of proposition and documents indexes """ - if source['pmid']: + if source["pmid"]: documents_id = f"pmid:{source['pmid']}" else: - documents_id = source['url'] + documents_id = source["url"] xrefs = [] - if source['doi']: + if source["doi"]: xrefs.append(f"doi:{source['doi']}") - if source['nct']: + if source["nct"]: xrefs.append(f"nct:{source['nct']}") documents = schemas.Document( - id=documents_id, - label=source['citation'], - xrefs=xrefs if xrefs else None + id=documents_id, label=source["citation"], xrefs=xrefs if xrefs else None ).dict(exclude_none=True) return [documents] @@ -351,14 +391,15 @@ def _get_method(self): :return: A list of methods """ - methods = [schemas.Method( - id=f'method:' - f'{schemas.MethodID.MOA_ASSERTION_BIORXIV}', - label='Clinical interpretation of integrative molecular profiles to guide precision cancer medicine', # noqa:E501 - url='https://www.biorxiv.org/content/10.1101/2020.09.22.308833v1', # noqa:E501 - version=schemas.Date(year=2020, month=9, day=22), - authors='Reardon, B., Moore, N.D., Moore, N. et al.' - ).dict()] + methods = [ + schemas.Method( + id=f"method:{schemas.MethodID.MOA_ASSERTION_BIORXIV}", + label="Clinical interpretation of integrative molecular profiles to guide precision cancer medicine", + url="https://www.biorxiv.org/content/10.1101/2020.09.22.308833v1", + version=schemas.Date(year=2020, month=9, day=22), + authors="Reardon, B., Moore, N.D., Moore, N. et al.", + ).dict() + ] return methods @@ -368,28 +409,35 @@ def _get_therapy_descriptors(self, assertion): :param: an MOA assertion record :return: A list of Therapy Descriptors """ - label = assertion['therapy_name'] + label = assertion["therapy_name"] if not label: return [] - therapy_norm_resp, normalized_therapy_id = \ - self.vicc_normalizers.normalize_therapy([label]) + ( + therapy_norm_resp, + normalized_therapy_id, + ) = self.vicc_normalizers.normalize_therapy([label]) if not normalized_therapy_id: logger.warning(f"Therapy Normalizer unable to normalize: {label}") return [] if normalized_therapy_id: - regulatory_approval_extension = \ - self.vicc_normalizers.get_regulatory_approval_extension(therapy_norm_resp) # noqa: E501 + regulatory_approval_extension = ( + self.vicc_normalizers.get_regulatory_approval_extension( + therapy_norm_resp + ) + ) therapy_descriptor = ValueObjectDescriptor( id=f"{schemas.SourceName.MOA.value}." - f"{therapy_norm_resp.therapy_descriptor.id}", + f"{therapy_norm_resp.therapy_descriptor.id}", type="TherapyDescriptor", label=label, therapy_id=normalized_therapy_id, - extensions=[regulatory_approval_extension] if regulatory_approval_extension else None # noqa: E501 + extensions=[regulatory_approval_extension] + if regulatory_approval_extension + else None, ).dict(exclude_none=True) else: return [] @@ -402,22 +450,26 @@ def _get_disease_descriptors(self, assertion): :param: an MOA assertion record :return: A list of Therapy Descriptors """ - ot_code = assertion['disease']['oncotree_code'] + ot_code = assertion["disease"]["oncotree_code"] if ot_code: ot_code = f"oncotree:{ot_code}" - disease_name = assertion['disease']['name'] + disease_name = assertion["disease"]["name"] - disease_norm_resp, normalized_disease_id = \ - self.vicc_normalizers.normalize_disease([ot_code, disease_name]) + ( + disease_norm_resp, + normalized_disease_id, + ) = self.vicc_normalizers.normalize_disease([ot_code, disease_name]) if not normalized_disease_id: - logger.warning(f"Disease Normalize unable to normalize: " - f"{ot_code} and {disease_name}") + logger.warning( + f"Disease Normalize unable to normalize: " + f"{ot_code} and {disease_name}" + ) return [] disease_descriptor = ValueObjectDescriptor( id=f"{schemas.SourceName.MOA.value}." - f"{disease_norm_resp.disease_descriptor.id}", + f"{disease_norm_resp.disease_descriptor.id}", type="DiseaseDescriptor", label=disease_name, disease_id=normalized_disease_id, @@ -432,5 +484,5 @@ def _get_record(self, record_id, records): :param: A dict of records for a given MOA record type """ for r in records: - if r['id'] == record_id: + if r["id"] == record_id: return r diff --git a/metakb/transform/oncokb.py b/metakb/transform/oncokb.py index 37ccc07a..007d2731 100644 --- a/metakb/transform/oncokb.py +++ b/metakb/transform/oncokb.py @@ -1,19 +1,31 @@ """A module for transforming OncoKB to common data model (CDM)""" -from typing import Optional, Dict, List +import logging +from copy import deepcopy from pathlib import Path +from typing import Dict, List, Optional from urllib.parse import quote -from copy import deepcopy -import logging -from ga4gh.vrsatile.pydantic.vrsatile_models import VariationDescriptor, \ - Extension, GeneDescriptor, ValueObjectDescriptor +from ga4gh.vrsatile.pydantic.vrsatile_models import ( + Extension, + GeneDescriptor, + ValueObjectDescriptor, + VariationDescriptor, +) from metakb import APP_ROOT from metakb.normalizers import VICCNormalizers +from metakb.schemas import ( + Date, + DiagnosticPredicate, + Document, + Method, + MethodID, + Predicate, + PredictivePredicate, + PropositionType, + Statement, +) from metakb.transform.base import Transform -from metakb.schemas import Date, DiagnosticPredicate, Document, Method, MethodID, \ - Predicate, PredictivePredicate, PropositionType, Statement - logger = logging.getLogger("metakb.transform.oncokb") logger.setLevel(logging.DEBUG) @@ -29,7 +41,7 @@ ("highestSensitiveLevel", "oncokb_highest_sensitive_level"), ("highestResistanceLevel", "oncokb_highest_resistance_level"), ("background", "oncokb_background"), - ("tsg", "tumor_suppressor_gene") + ("tsg", "tumor_suppressor_gene"), ] @@ -41,10 +53,16 @@ ("vus", "vus"), ("highestSensitiveLevel", "oncokb_highest_sensitive_level"), ("highestResistanceLevel", "oncokb_highest_resistance_level"), - ("highestDiagnosticImplicationLevel", "oncokb_highest_diagnostic_implication_level"), # noqa: E501 - ("highestPrognosticImplicationLevel", "oncokb_highest_prognostic_implication_level"), # noqa: E501 + ( + "highestDiagnosticImplicationLevel", + "oncokb_highest_diagnostic_implication_level", + ), + ( + "highestPrognosticImplicationLevel", + "oncokb_highest_prognostic_implication_level", + ), ("highestFdaLevel", "oncokb_highest_fda_level"), - ("alleleExist", "allele_exist") + ("alleleExist", "allele_exist"), ] @@ -55,7 +73,7 @@ ("children", "children"), ("parent", "parent"), ("level", "level"), - ("tumorForm", "tumor_form") + ("tumorForm", "tumor_form"), ] @@ -65,8 +83,10 @@ class OncoKBTransform(Transform): method = f"method:{MethodID.ONCOKB_SOP}" def __init__( - self, data_dir: Path = APP_ROOT / "data", harvester_path: Optional[Path] = None, - normalizers: Optional[VICCNormalizers] = None + self, + data_dir: Path = APP_ROOT / "data", + harvester_path: Optional[Path] = None, + normalizers: Optional[VICCNormalizers] = None, ) -> None: """Initialize OncoKB Transform class. @@ -76,22 +96,18 @@ def __init__( """ super().__init__(data_dir, harvester_path, normalizers) # Able to normalize these IDSs - self.valid_ids = { - "disease_descriptors": dict(), - "therapy_descriptors": dict() - } + self.valid_ids = {"disease_descriptors": dict(), "therapy_descriptors": dict()} # Unable to normalize these IDs - self.invalid_ids = { - "therapy_descriptors": set(), - "disease_descriptors": set() - } + self.invalid_ids = {"therapy_descriptors": set(), "disease_descriptors": set()} self.methods = [ - Method(id=f"method:{MethodID.ONCOKB_SOP}", - label="OncoKB Curation Standard Operating Procedure", - url="https://sop.oncokb.org/", - version=Date(year=2021, month=11).dict(), - authors="OncoKB").dict(exclude_none=True) + Method( + id=f"method:{MethodID.ONCOKB_SOP}", + label="OncoKB Curation Standard Operating Procedure", + url="https://sop.oncokb.org/", + version=Date(year=2021, month=11).dict(), + authors="OncoKB", + ).dict(exclude_none=True) ] async def transform(self) -> None: @@ -124,8 +140,17 @@ async def _transform_evidence(self, variants_data: List[Dict]) -> None: for data in variants_data: # Exclude trying on variants we know we can't normalize unable_to_normalize_variant = { - "fusion", "fusions", "mutation", "mutations", "tandem", "domain", - "splice", "deletion", "hypermethylation", "silencing", "overexpression" + "fusion", + "fusions", + "mutation", + "mutations", + "tandem", + "domain", + "splice", + "deletion", + "hypermethylation", + "silencing", + "overexpression", } alt = data["query"]["alteration"] @@ -153,9 +178,15 @@ async def _transform_evidence(self, variants_data: List[Dict]) -> None: self._add_therapeutic_evidence(treatment, variation_descriptor) def _add_evidence( - self, evidence_data: Dict, proposition_type: PropositionType, level: str, - predicate: Predicate, disease_data: Dict, variation_descriptor: Dict, - extensions: Optional[List] = None, therapy_descriptor: Optional[Dict] = None + self, + evidence_data: Dict, + proposition_type: PropositionType, + level: str, + predicate: Predicate, + disease_data: Dict, + variation_descriptor: Dict, + extensions: Optional[List] = None, + therapy_descriptor: Optional[Dict] = None, ) -> None: """Add transformed oncokb evidence as statements Will update instance variables (disease_descriptors, proposition, documents, @@ -177,9 +208,12 @@ def _add_evidence( return None proposition = self._get_proposition( - proposition_type, predicate, variation_descriptor["variation_id"], + proposition_type, + predicate, + variation_descriptor["variation_id"], disease_descriptor["disease_id"], - therapy_descriptor["therapy_id"] if therapy_descriptor else None) + therapy_descriptor["therapy_id"] if therapy_descriptor else None, + ) if proposition: documents = self._get_documents(evidence_data["pmids"]) description = evidence_data["description"] @@ -190,19 +224,23 @@ def _add_evidence( "proposition": proposition["id"], "variation_descriptor": variation_descriptor["id"], "disease_descriptor": disease_descriptor["id"], - "therapy_descriptor": therapy_descriptor["id"] if therapy_descriptor else None, # noqa: E501 + "therapy_descriptor": therapy_descriptor["id"] + if therapy_descriptor + else None, "method": self.methods[0]["id"], "supported_by": [d["id"] for d in documents], - "extensions": extensions + "extensions": extensions, } digest = self._generate_digest(statement_params) statement_params["id"] = f"oncokb.evidence:{digest}" - statement = Statement( - **statement_params).dict(by_alias=True, exclude_none=True) + statement = Statement(**statement_params).dict( + by_alias=True, exclude_none=True + ) self.statements.append(statement) - def _add_diagnostic_evidence(self, diagnostic_implication: Dict, - variation_descriptor: Dict) -> None: + def _add_diagnostic_evidence( + self, diagnostic_implication: Dict, variation_descriptor: Dict + ) -> None: """Transform OncoKB Diagnostic Evidence to common data model. Will update instance variables (statements, propositions, variation_descriptors, gene_descriptors, therapy_descriptors, disease_descriptors, documents) with @@ -216,11 +254,18 @@ def _add_diagnostic_evidence(self, diagnostic_implication: Dict, proposition_type = PropositionType.DIAGNOSTIC level = diagnostic_implication["levelOfEvidence"] disease_data = diagnostic_implication["tumorType"] - self._add_evidence(diagnostic_implication, proposition_type, level, predicate, - disease_data, variation_descriptor) - - def _add_therapeutic_evidence(self, treatment: Dict, - variation_descriptor: Dict) -> None: + self._add_evidence( + diagnostic_implication, + proposition_type, + level, + predicate, + disease_data, + variation_descriptor, + ) + + def _add_therapeutic_evidence( + self, treatment: Dict, variation_descriptor: Dict + ) -> None: """Transform OncoKB Therapeutic Evidence to common data model. Will update instance variables (statements, propositions, variation_descriptors, gene_descriptors, therapy_descriptors, disease_descriptors, documents) with @@ -251,15 +296,19 @@ def _add_therapeutic_evidence(self, treatment: Dict, extensions = list() fda_level = treatment["fdaLevel"] if fda_level: - ext_value = { - "level": fda_level, - "description": self.fda_levels[fda_level] - - } + ext_value = {"level": fda_level, "description": self.fda_levels[fda_level]} extensions.append(Extension(name="onckb_fda_level", value=ext_value).dict()) - self._add_evidence(treatment, proposition_type, level, predicate, disease_data, - variation_descriptor, extensions, therapy_descriptor) + self._add_evidence( + treatment, + proposition_type, + level, + predicate, + disease_data, + variation_descriptor, + extensions, + therapy_descriptor, + ) def _add_therapy_descriptor(self, drugs_data: List[Dict]) -> Optional[Dict]: """Get therapy descriptor @@ -282,7 +331,9 @@ def _add_therapy_descriptor(self, drugs_data: List[Dict]) -> Optional[Dict]: if ncit_code not in self.invalid_ids["therapy_descriptors"]: therapy_descriptor = self._get_therapy_descriptor(drugs_data) if therapy_descriptor: - self.valid_ids["therapy_descriptors"][ncit_code] = therapy_descriptor # noqa: E501 + self.valid_ids["therapy_descriptors"][ + ncit_code + ] = therapy_descriptor self.therapy_descriptors.append(therapy_descriptor) else: self.invalid_ids["therapy_descriptors"].add(ncit_code) @@ -302,14 +353,19 @@ def _get_therapy_descriptor(self, drugs_data: List[Dict]) -> Optional[Dict]: label = drug["drugName"] queries = [ncit_id, label] - therapy_norm_resp, normalized_therapy_id = \ - self.vicc_normalizers.normalize_therapy(queries) + ( + therapy_norm_resp, + normalized_therapy_id, + ) = self.vicc_normalizers.normalize_therapy(queries) if not normalized_therapy_id: - logger.warning(f"Therapy Normalizer unable to normalize using queries: {queries}") # noqa: E501 + logger.warning( + f"Therapy Normalizer unable to normalize using queries: {queries}" + ) return None - regulatory_approval_extension = \ + regulatory_approval_extension = ( self.vicc_normalizers.get_regulatory_approval_extension(therapy_norm_resp) + ) return ValueObjectDescriptor( type="TherapyDescriptor", @@ -318,7 +374,9 @@ def _get_therapy_descriptor(self, drugs_data: List[Dict]) -> Optional[Dict]: therapy_id=normalized_therapy_id, alternate_labels=drug["synonyms"] if drug["synonyms"] else None, xrefs=[ncit_id], - extensions=[regulatory_approval_extension] if regulatory_approval_extension else None # noqa: E501 + extensions=[regulatory_approval_extension] + if regulatory_approval_extension + else None, ).dict(exclude_none=True) def _add_disease_descriptor(self, disease_data: Dict) -> Optional[Dict]: @@ -339,8 +397,9 @@ def _add_disease_descriptor(self, disease_data: Dict) -> Optional[Dict]: if disease_id not in self.invalid_ids["disease_descriptors"]: disease_descriptor = self._get_disease_descriptor(disease_data) if disease_descriptor: - self.valid_ids["disease_descriptors"][disease_id] = \ - disease_descriptor + self.valid_ids["disease_descriptors"][ + disease_id + ] = disease_descriptor self.disease_descriptors.append(disease_descriptor) else: self.invalid_ids["disease_descriptors"].add(disease_id) @@ -359,8 +418,10 @@ def _get_disease_descriptor(self, disease_data: Dict) -> Optional[Dict]: queries = [oncotree_code, label] _, normalized_disease_id = self.vicc_normalizers.normalize_disease(queries) if not normalized_disease_id: - logger.warning(f"Disease Normalizer unable to normalize: " - f"{oncokb_disease_id} using queries {queries}") + logger.warning( + f"Disease Normalizer unable to normalize: " + f"{oncokb_disease_id} using queries {queries}" + ) return None extensions = list() @@ -379,7 +440,7 @@ def _get_disease_descriptor(self, disease_data: Dict) -> Optional[Dict]: label=label, disease_id=normalized_disease_id, xrefs=[oncotree_code], - extensions=extensions if extensions else None + extensions=extensions if extensions else None, ).dict(exclude_none=True) return disease_descriptor @@ -417,7 +478,7 @@ def _add_gene_descriptors(self, genes: List[Dict]) -> None: gene_id=normalized_gene_id, description=gene["summary"] if gene["summary"] else None, extensions=extensions if extensions else None, - xrefs=xrefs if xrefs else None + xrefs=xrefs if xrefs else None, ).dict(exclude_none=True) self.gene_descriptors.append(gene_descriptor) else: @@ -440,7 +501,8 @@ async def _add_variation_descriptor(self, data: Dict) -> Optional[Dict]: variant = f"{gene} {alteration}" variation_descriptor = await self.vicc_normalizers.normalize_variation( - [variant]) + [variant] + ) if not variation_descriptor: logger.warning(f"Variation Normalizer unable to normalize: {variant}") @@ -458,7 +520,7 @@ async def _add_variation_descriptor(self, data: Dict) -> Optional[Dict]: variation_id=variation_descriptor.variation_id, variation=variation_descriptor.variation, gene_context=f"oncokb.normalize.gene:{query['hugoSymbol']}", - extensions=extensions if extensions else None + extensions=extensions if extensions else None, ).dict(by_alias=True, exclude_none=True) self.variation_descriptors.append(vd) return vd @@ -471,10 +533,9 @@ def _get_documents(self, pmids: List[str]) -> List[dict]: """ documents = list() for pmid in pmids: - document = Document( - id=f"pmid:{pmid}", - label=f"PubMed {pmid}" - ).dict(exclude_none=True) + document = Document(id=f"pmid:{pmid}", label=f"PubMed {pmid}").dict( + exclude_none=True + ) documents.append(document) if document not in self.documents: self.documents.append(document) diff --git a/pyproject.toml b/pyproject.toml index 68225ac2..bde679a3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,6 +4,7 @@ build-backend = "setuptools.build_meta" [tool.ruff] src = ["metakb"] +exclude = ["codebuild/*", "docs/*"] # pycodestyle (E, W) # Pyflakes (F) # flake8-annotations (ANN) @@ -43,3 +44,5 @@ ignore = [ # N805 - invalid-first-argument-name-for-method "tests/*" = ["ANN001", "ANN102", "ANN2"] "metakb/schemas.py" = ["ANN001", "ANN201", "N805"] +"analysis/*" = ["ANN001", "ANN201"] +"metakb/*" = ["ANN"] diff --git a/setup.py b/setup.py index 67ff5cca..ee299138 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,5 @@ """Module for package and distribution.""" from setuptools import setup -exec(open('metakb/version.py').read()) +exec(open("metakb/version.py").read()) setup(version=__version__) # noqa: F821 diff --git a/tests/conftest.py b/tests/conftest.py index 369dc65b..ae29d427 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,11 +1,10 @@ """Module for pytest fixtures.""" +import asyncio import os import pytest -import asyncio - -from metakb.query import QueryHandler from metakb.normalizers import VICCNormalizers +from metakb.query import QueryHandler from metakb.schemas import SourceName @@ -23,7 +22,7 @@ def civic_eid2997_statement(): return { "id": "civic.eid:2997", "type": "Statement", - "description": "Afatinib, an irreversible inhibitor of the ErbB family of tyrosine kinases has been approved in the US for the first-line treatment of patients with metastatic non-small-cell lung cancer (NSCLC) who have tumours with EGFR exon 19 deletions or exon 21 (L858R) substitution mutations as detected by a US FDA-approved test", # noqa: E501 + "description": "Afatinib, an irreversible inhibitor of the ErbB family of tyrosine kinases has been approved in the US for the first-line treatment of patients with metastatic non-small-cell lung cancer (NSCLC) who have tumours with EGFR exon 19 deletions or exon 21 (L858R) substitution mutations as detected by a US FDA-approved test", "direction": "supports", "evidence_level": "civic.evidence_level:A", "proposition": "proposition:Zfp_VG0uvxwteCcJYO6_AJv1KDmJlFjs", @@ -32,7 +31,7 @@ def civic_eid2997_statement(): "therapy_descriptor": "civic.tid:146", "disease_descriptor": "civic.did:8", "method": "method:1", - "supported_by": ["pmid:23982599"] + "supported_by": ["pmid:23982599"], } @@ -64,27 +63,22 @@ def civic_vid33(): "interval": { "end": {"value": 858, "type": "Number"}, "start": {"value": 857, "type": "Number"}, - "type": "SequenceInterval" + "type": "SequenceInterval", }, "sequence_id": "ga4gh:SQ.vyo55F6mA6n2LgN4cagcdRzOuh38V4mE", - "type": "SequenceLocation" - }, - "state": { - "sequence": "R", - "type": "LiteralSequenceExpression" + "type": "SequenceLocation", }, - "type": "Allele" + "state": {"sequence": "R", "type": "LiteralSequenceExpression"}, + "type": "Allele", }, "xrefs": [ "clinvar:376280", "clinvar:376282", "clinvar:16609", "caid:CA126713", - "dbsnp:121434568" - ], - "alternate_labels": [ - "LEU858ARG" + "dbsnp:121434568", ], + "alternate_labels": ["LEU858ARG"], "extensions": [ { "name": "civic_representative_coordinate", @@ -97,9 +91,9 @@ def civic_vid33(): "representative_transcript": "ENST00000275493.2", "ensembl_version": 75, "reference_build": "GRCh37", - "type": "coordinates" + "type": "coordinates", }, - "type": "Extension" + "type": "Extension", } ], "structural_type": "SO:0001583", @@ -107,25 +101,25 @@ def civic_vid33(): { "syntax": "hgvs.p", "value": "NP_005219.2:p.Leu858Arg", - "type": "Expression" + "type": "Expression", }, { "syntax": "hgvs.c", "value": "ENST00000275493.2:c.2573T>G", - "type": "Expression" + "type": "Expression", }, { "syntax": "hgvs.c", "value": "NM_005228.4:c.2573T>G", - "type": "Expression" + "type": "Expression", }, { "syntax": "hgvs.g", "value": "NC_000007.13:g.55259515T>G", - "type": "Expression" - } + "type": "Expression", + }, ], - "gene_context": "civic.gid:19" + "gene_context": "civic.gid:19", } @@ -136,7 +130,7 @@ def civic_gid19(): "id": "civic.gid:19", "type": "GeneDescriptor", "label": "EGFR", - "description": "EGFR is widely recognized for its importance in cancer. Amplification and mutations have been shown to be driving events in many cancer types. Its role in non-small cell lung cancer, glioblastoma and basal-like breast cancers has spurred many research and drug development efforts. Tyrosine kinase inhibitors have shown efficacy in EGFR amplfied tumors, most notably gefitinib and erlotinib. Mutations in EGFR have been shown to confer resistance to these drugs, particularly the variant T790M, which has been functionally characterized as a resistance marker for both of these drugs. The later generation TKI's have seen some success in treating these resistant cases, and targeted sequencing of the EGFR locus has become a common practice in treatment of non-small cell lung cancer. Overproduction of ligands is another possible mechanism of activation of EGFR. ERBB ligands include EGF, TGF-a, AREG, EPG, BTC, HB-EGF, EPR and NRG1-4 (for detailed information please refer to the respective ligand section).", # noqa: E501 + "description": "EGFR is widely recognized for its importance in cancer. Amplification and mutations have been shown to be driving events in many cancer types. Its role in non-small cell lung cancer, glioblastoma and basal-like breast cancers has spurred many research and drug development efforts. Tyrosine kinase inhibitors have shown efficacy in EGFR amplfied tumors, most notably gefitinib and erlotinib. Mutations in EGFR have been shown to confer resistance to these drugs, particularly the variant T790M, which has been functionally characterized as a resistance marker for both of these drugs. The later generation TKI's have seen some success in treating these resistant cases, and targeted sequencing of the EGFR locus has become a common practice in treatment of non-small cell lung cancer. Overproduction of ligands is another possible mechanism of activation of EGFR. ERBB ligands include EGF, TGF-a, AREG, EPG, BTC, HB-EGF, EPR and NRG1-4 (for detailed information please refer to the respective ligand section).", "gene_id": "hgnc:3236", "alternate_labels": [ "EGFR", @@ -146,11 +140,9 @@ def civic_gid19(): "HER1", "NISBD2", "PIG61", - "mENA" + "mENA", ], - "xrefs": [ - "ncbigene:1956" - ] + "xrefs": ["ncbigene:1956"], } @@ -165,11 +157,9 @@ def civic_tid146(): "alternate_labels": [ "BIBW2992", "BIBW 2992", - "(2e)-N-(4-(3-Chloro-4-Fluoroanilino)-7-(((3s)-Oxolan-3-yl)Oxy)Quinoxazolin-6-yl)-4-(Dimethylamino)But-2-Enamide" # noqa: E501 - ], - "xrefs": [ - "ncit:C66940" + "(2e)-N-(4-(3-Chloro-4-Fluoroanilino)-7-(((3s)-Oxolan-3-yl)Oxy)Quinoxazolin-6-yl)-4-(Dimethylamino)But-2-Enamide", ], + "xrefs": ["ncit:C66940"], "extensions": [ { "type": "Extension", @@ -181,18 +171,18 @@ def civic_tid146(): "id": "hemonc:25316", "type": "DiseaseDescriptor", "label": "Non-small cell lung cancer squamous", - "disease_id": None + "disease_id": None, }, { "id": "hemonc:642", "type": "DiseaseDescriptor", "label": "Non-small cell lung cancer", - "disease_id": "ncit:C2926" - } - ] - } + "disease_id": "ncit:C2926", + }, + ], + }, } - ] + ], } @@ -204,9 +194,7 @@ def civic_did8(): "type": "DiseaseDescriptor", "label": "Lung Non-small Cell Carcinoma", "disease_id": "ncit:C2926", - "xrefs": [ - "DOID:3908" - ] + "xrefs": ["DOID:3908"], } @@ -217,7 +205,7 @@ def pmid_23982599(): "id": "pmid:23982599", "type": "Document", "label": "Dungo et al., 2013, Drugs", - "description": "Afatinib: first global approval." + "description": "Afatinib: first global approval.", } @@ -226,7 +214,7 @@ def civic_eid1409_statement(): """Create test fixture for CIViC Evidence 1406.""" return { "id": "civic.eid:1409", - "description": "Phase 3 randomized clinical trial comparing vemurafenib with dacarbazine in 675 patients with previously untreated, metastatic melanoma with the BRAF V600E mutation. At 6 months, overall survival was 84% (95% confidence interval [CI], 78 to 89) in the vemurafenib group and 64% (95% CI, 56 to 73) in the dacarbazine group. A relative reduction of 63% in the risk of death and of 74% in the risk of either death or disease progression was observed with vemurafenib as compared with dacarbazine (P<0.001 for both comparisons).", # noqa: E501 + "description": "Phase 3 randomized clinical trial comparing vemurafenib with dacarbazine in 675 patients with previously untreated, metastatic melanoma with the BRAF V600E mutation. At 6 months, overall survival was 84% (95% confidence interval [CI], 78 to 89) in the vemurafenib group and 64% (95% CI, 56 to 73) in the dacarbazine group. A relative reduction of 63% in the risk of death and of 74% in the risk of either death or disease progression was observed with vemurafenib as compared with dacarbazine (P<0.001 for both comparisons).", "direction": "supports", "evidence_level": "civic.evidence_level:A", "proposition": "proposition:wsW_PurZodw_qHg1Iw8iAR1CUQte1CLA", @@ -236,7 +224,7 @@ def civic_eid1409_statement(): "disease_descriptor": "civic.did:206", "method": "method:1", "supported_by": ["pmid:21639808"], - "type": "Statement" + "type": "Statement", } @@ -245,7 +233,7 @@ def civic_aid6_statement(): """Create CIViC AID 6 test fixture.""" return { "id": "civic.aid:6", - "description": "L858R is among the most common sensitizing EGFR mutations in NSCLC, and is assessed via DNA mutational analysis, including Sanger sequencing and next generation sequencing methods. Tyrosine kinase inhibitor afatinib is FDA approved as a first line systemic therapy in NSCLC with sensitizing EGFR mutation.", # noqa: E501 + "description": "L858R is among the most common sensitizing EGFR mutations in NSCLC, and is assessed via DNA mutational analysis, including Sanger sequencing and next generation sequencing methods. Tyrosine kinase inhibitor afatinib is FDA approved as a first line systemic therapy in NSCLC with sensitizing EGFR mutation.", "direction": "supports", "evidence_level": "amp_asco_cap_2017_level:1A", "proposition": "proposition:Zfp_VG0uvxwteCcJYO6_AJv1KDmJlFjs", @@ -255,12 +243,15 @@ def civic_aid6_statement(): "disease_descriptor": "civic.did:8", "method": "method:2", "supported_by": [ - "document:9WsQBGXOmTFRXBUanTaIec8Gvgg8bsMA", "civic.eid:2997", - "civic.eid:2629", "civic.eid:982", - "civic.eid:968", "civic.eid:883", - "civic.eid:879" + "document:9WsQBGXOmTFRXBUanTaIec8Gvgg8bsMA", + "civic.eid:2997", + "civic.eid:2629", + "civic.eid:982", + "civic.eid:968", + "civic.eid:883", + "civic.eid:879", ], - "type": "Statement" + "type": "Statement", } @@ -270,9 +261,9 @@ def civic_aid6_document(): return { "id": "document:9WsQBGXOmTFRXBUanTaIec8Gvgg8bsMA", "document_id": "https://www.nccn.org/professionals/" - "physician_gls/default.aspx", + "physician_gls/default.aspx", "label": "NCCN Guidelines: Non-Small Cell Lung Cancer version 3.2018", - "type": "Document" + "type": "Document", } @@ -282,7 +273,7 @@ def civic_eid2_statement(): return { "id": "civic.eid:2", "type": "Statement", - "description": "GIST tumors harboring PDGFRA D842V mutation are more likely to be benign than malignant.", # noqa: E501 + "description": "GIST tumors harboring PDGFRA D842V mutation are more likely to be benign than malignant.", "direction": "supports", "evidence_level": "civic.evidence_level:B", "proposition": "proposition:KVuJMXiPm-oK4vvijE9Cakvucayay3jE", @@ -290,7 +281,7 @@ def civic_eid2_statement(): "variation_descriptor": "civic.vid:99", "disease_descriptor": "civic.did:2", "method": "method:1", - "supported_by": ["pmid:15146165"] + "supported_by": ["pmid:15146165"], } @@ -302,7 +293,7 @@ def civic_eid2_proposition(): "type": "diagnostic_proposition", "predicate": "is_diagnostic_exclusion_criterion_for", "subject": "ga4gh:VA.bjWVYvXPaPbIRAfZvE0Uw_P-i36PGkAz", - "object_qualifier": "ncit:C3868" + "object_qualifier": "ncit:C3868", } @@ -321,25 +312,16 @@ def civic_vid99(): "interval": { "start": {"value": 841, "type": "Number"}, "end": {"value": 842, "type": "Number"}, - "type": "SequenceInterval" + "type": "SequenceInterval", }, "sequence_id": "ga4gh:SQ.XpQn9sZLGv_GU3uiWO7YHq9-_alGjrVX", - "type": "SequenceLocation" + "type": "SequenceLocation", }, - "state": { - "sequence": "V", - "type": "LiteralSequenceExpression" - }, - "type": "Allele" + "state": {"sequence": "V", "type": "LiteralSequenceExpression"}, + "type": "Allele", }, - "xrefs": [ - "clinvar:13543", - "caid:CA123194", - "dbsnp:121908585" - ], - "alternate_labels": [ - "ASP842VAL" - ], + "xrefs": ["clinvar:13543", "caid:CA123194", "dbsnp:121908585"], + "alternate_labels": ["ASP842VAL"], "extensions": [ { "name": "civic_representative_coordinate", @@ -352,9 +334,9 @@ def civic_vid99(): "representative_transcript": "ENST00000257290.5", "ensembl_version": 75, "reference_build": "GRCh37", - "type": "coordinates" + "type": "coordinates", }, - "type": "Extension" + "type": "Extension", } ], "structural_type": "SO:0001583", @@ -362,25 +344,25 @@ def civic_vid99(): { "syntax": "hgvs.c", "value": "NM_006206.4:c.2525A>T", - "type": "Expression" + "type": "Expression", }, { "syntax": "hgvs.p", "value": "NP_006197.1:p.Asp842Val", - "type": "Expression" + "type": "Expression", }, { "syntax": "hgvs.c", "value": "ENST00000257290.5:c.2525A>T", - "type": "Expression" + "type": "Expression", }, { "syntax": "hgvs.g", "value": "NC_000004.11:g.55152093A>T", - "type": "Expression" - } + "type": "Expression", + }, ], - "gene_context": "civic.gid:38" + "gene_context": "civic.gid:38", } @@ -392,9 +374,7 @@ def civic_did2(): "type": "DiseaseDescriptor", "label": "Gastrointestinal Stromal Tumor", "disease_id": "ncit:C3868", - "xrefs": [ - "DOID:9253" - ] + "xrefs": ["DOID:9253"], } @@ -405,17 +385,10 @@ def civic_gid38(): "id": "civic.gid:38", "type": "GeneDescriptor", "label": "PDGFRA", - "description": "Commonly mutated in GI tract tumors, PDGFR family genes (mutually exclusive to KIT mutations) are a hallmark of gastrointestinal stromal tumors. Gene fusions involving the PDGFRA kinase domain are highly correlated with eosinophilia, and the WHO classifies myeloid and lymphoid neoplasms with these characteristics as a distinct disorder. Mutations in the 842 region of PDGFRA have been often found to confer resistance to the tyrosine kinase inhibitor, imatinib.", # noqa: E501 + "description": "Commonly mutated in GI tract tumors, PDGFR family genes (mutually exclusive to KIT mutations) are a hallmark of gastrointestinal stromal tumors. Gene fusions involving the PDGFRA kinase domain are highly correlated with eosinophilia, and the WHO classifies myeloid and lymphoid neoplasms with these characteristics as a distinct disorder. Mutations in the 842 region of PDGFRA have been often found to confer resistance to the tyrosine kinase inhibitor, imatinib.", "gene_id": "hgnc:8803", - "alternate_labels": [ - "PDGFRA", - "PDGFR2", - "PDGFR-2", - "CD140A" - ], - "xrefs": [ - "ncbigene:5156" - ] + "alternate_labels": ["PDGFRA", "PDGFR2", "PDGFR-2", "CD140A"], + "xrefs": ["ncbigene:5156"], } @@ -424,7 +397,7 @@ def civic_eid74_statement(): """Create a test fixture for CIViC EID74 statement.""" return { "id": "civic.eid:74", - "description": "In patients with medullary carcinoma, the presence of RET M918T mutation is associated with increased probability of lymph node metastases.", # noqa: E501 + "description": "In patients with medullary carcinoma, the presence of RET M918T mutation is associated with increased probability of lymph node metastases.", "direction": "supports", "evidence_level": "civic.evidence_level:B", "proposition": "proposition:Vyzbpg-s6mw27yJfYBFxGyQeuEJacP4l", @@ -433,7 +406,7 @@ def civic_eid74_statement(): "disease_descriptor": "civic.did:15", "method": "method:1", "supported_by": ["pmid:18073307"], - "type": "Statement" + "type": "Statement", } @@ -445,7 +418,7 @@ def civic_eid74_proposition(): "type": "diagnostic_proposition", "predicate": "is_diagnostic_inclusion_criterion_for", "subject": "ga4gh:VA.GweduWrfxV58YnSvUBfHPGOA-KCH_iIl", - "object_qualifier": "ncit:C3879" + "object_qualifier": "ncit:C3879", } @@ -464,25 +437,16 @@ def civic_vid113(): "interval": { "end": {"value": 918, "type": "Number"}, "start": {"value": 917, "type": "Number"}, - "type": "SequenceInterval" + "type": "SequenceInterval", }, "sequence_id": "ga4gh:SQ.jMu9-ItXSycQsm4hyABeW_UfSNRXRVnl", - "type": "SequenceLocation" - }, - "state": { - "sequence": "T", - "type": "LiteralSequenceExpression" + "type": "SequenceLocation", }, - "type": "Allele" + "state": {"sequence": "T", "type": "LiteralSequenceExpression"}, + "type": "Allele", }, - "xrefs": [ - "clinvar:13919", - "caid:CA009082", - "dbsnp:74799832" - ], - "alternate_labels": [ - "MET918THR" - ], + "xrefs": ["clinvar:13919", "caid:CA009082", "dbsnp:74799832"], + "alternate_labels": ["MET918THR"], "extensions": [ { "name": "civic_representative_coordinate", @@ -495,9 +459,9 @@ def civic_vid113(): "representative_transcript": "ENST00000355710.3", "ensembl_version": 75, "reference_build": "GRCh37", - "type": "coordinates" + "type": "coordinates", }, - "type": "Extension" + "type": "Extension", } ], "structural_type": "SO:0001583", @@ -505,25 +469,25 @@ def civic_vid113(): { "syntax": "hgvs.c", "value": "NM_020975.4:c.2753T>C", - "type": "Expression" + "type": "Expression", }, { "syntax": "hgvs.p", "value": "NP_065681.1:p.Met918Thr", - "type": "Expression" + "type": "Expression", }, { "syntax": "hgvs.c", "value": "ENST00000355710.3:c.2753T>C", - "type": "Expression" + "type": "Expression", }, { "syntax": "hgvs.g", "value": "NC_000010.10:g.43617416T>C", - "type": "Expression" - } + "type": "Expression", + }, ], - "gene_context": "civic.gid:42" + "gene_context": "civic.gid:42", } @@ -535,9 +499,7 @@ def civic_did15(): "type": "DiseaseDescriptor", "label": "Thyroid Gland Medullary Carcinoma", "disease_id": "ncit:C3879", - "xrefs": [ - "DOID:3973" - ] + "xrefs": ["DOID:3973"], } @@ -548,7 +510,7 @@ def civic_gid42(): "id": "civic.gid:42", "type": "GeneDescriptor", "label": "RET", - "description": "RET mutations and the RET fusion RET-PTC lead to activation of this tyrosine kinase receptor and are associated with thyroid cancers. RET point mutations are the most common mutations identified in medullary thyroid cancer (MTC) with germline and somatic mutations in RET associated with hereditary and sporadic forms, respectively. The most common somatic form mutation is M918T (exon 16) and a variety of other mutations effecting exons 10, 11 and 15 have been described. The prognostic significance of these mutations have been hotly debated in the field, however, data suggests that some RET mutation may confer drug resistence. No RET-specific agents are currently clinically available but several promiscuous kinase inhibitors that target RET, among others, have been approved for MTC treatment.", # noqa: E501 + "description": "RET mutations and the RET fusion RET-PTC lead to activation of this tyrosine kinase receptor and are associated with thyroid cancers. RET point mutations are the most common mutations identified in medullary thyroid cancer (MTC) with germline and somatic mutations in RET associated with hereditary and sporadic forms, respectively. The most common somatic form mutation is M918T (exon 16) and a variety of other mutations effecting exons 10, 11 and 15 have been described. The prognostic significance of these mutations have been hotly debated in the field, however, data suggests that some RET mutation may confer drug resistence. No RET-specific agents are currently clinically available but several promiscuous kinase inhibitors that target RET, among others, have been approved for MTC treatment.", "gene_id": "hgnc:9967", "alternate_labels": [ "RET", @@ -559,11 +521,9 @@ def civic_gid42(): "MEN2A", "HSCR1", "CDHR16", - "CDHF12" + "CDHF12", ], - "xrefs": [ - "ncbigene:5979" - ] + "xrefs": ["ncbigene:5979"], } @@ -572,7 +532,7 @@ def civic_aid9_statement(): """Create a test fixture for CIViC AID9 statement.""" return { "id": "civic.aid:9", - "description": "ACVR1 G328V mutations occur within the kinase domain, leading to activation of downstream signaling. Exclusively seen in high-grade pediatric gliomas, supporting diagnosis of diffuse intrinsic pontine glioma.", # noqa: E501 + "description": "ACVR1 G328V mutations occur within the kinase domain, leading to activation of downstream signaling. Exclusively seen in high-grade pediatric gliomas, supporting diagnosis of diffuse intrinsic pontine glioma.", "direction": "supports", "evidence_level": "amp_asco_cap_2017_level:2C", "proposition": "proposition:Pjri4dU2VaEKcdKtVkoAUJ8bHFXnW2My", @@ -580,9 +540,8 @@ def civic_aid9_statement(): "variation_descriptor": "civic.vid:1686", "disease_descriptor": "civic.did:2950", "method": "method:2", - "supported_by": ["civic.eid:4846", - "civic.eid:6955"], - "type": "Statement" + "supported_by": ["civic.eid:4846", "civic.eid:6955"], + "type": "Statement", } @@ -594,7 +553,7 @@ def civic_aid9_proposition(): "predicate": "is_diagnostic_inclusion_criterion_for", "subject": "ga4gh:VA.yuvNtv-SpNOzcGsKsNnnK0n026rbfp6T", "object_qualifier": "DOID:0080684", - "type": "diagnostic_proposition" + "type": "diagnostic_proposition", } @@ -613,25 +572,16 @@ def civic_vid1686(): "interval": { "end": {"value": 328, "type": "Number"}, "start": {"value": 327, "type": "Number"}, - "type": "SequenceInterval" + "type": "SequenceInterval", }, "sequence_id": "ga4gh:SQ.6CnHhDq_bDCsuIBf0AzxtKq_lXYM7f0m", - "type": "SequenceLocation" + "type": "SequenceLocation", }, - "state": { - "sequence": "V", - "type": "LiteralSequenceExpression" - }, - "type": "Allele" + "state": {"sequence": "V", "type": "LiteralSequenceExpression"}, + "type": "Allele", }, - "xrefs": [ - "clinvar:376363", - "caid:CA16602802", - "dbsnp:387906589" - ], - "alternate_labels": [ - "GLY328VAL" - ], + "xrefs": ["clinvar:376363", "caid:CA16602802", "dbsnp:387906589"], + "alternate_labels": ["GLY328VAL"], "extensions": [ { "name": "civic_representative_coordinate", @@ -644,35 +594,31 @@ def civic_vid1686(): "representative_transcript": "ENST00000434821.1", "ensembl_version": 75, "reference_build": "GRCh37", - "type": "coordinates" + "type": "coordinates", }, - "type": "Extension" + "type": "Extension", } ], "structural_type": "SO:0001583", "expressions": [ - { - "syntax": "hgvs.c", - "value": "NM_001105.4:c.983G>T", - "type": "Expression" - }, + {"syntax": "hgvs.c", "value": "NM_001105.4:c.983G>T", "type": "Expression"}, { "syntax": "hgvs.p", "value": "NP_001096.1:p.Gly328Val", - "type": "Expression" + "type": "Expression", }, { "syntax": "hgvs.g", "value": "NC_000002.11:g.158622516C>A", - "type": "Expression" + "type": "Expression", }, { "syntax": "hgvs.c", "value": "ENST00000434821.1:c.983G>T", - "type": "Expression" - } + "type": "Expression", + }, ], - "gene_context": "civic.gid:154" + "gene_context": "civic.gid:154", } @@ -684,9 +630,7 @@ def civic_did2950(): "type": "DiseaseDescriptor", "label": "Diffuse Midline Glioma, H3 K27M-mutant", "disease_id": "DOID:0080684", - "xrefs": [ - "DOID:0080684" - ] + "xrefs": ["DOID:0080684"], } @@ -706,11 +650,9 @@ def civic_gid154(): "ALK2", "ACVRLK2", "ACVR1A", - "ACTRI" + "ACTRI", ], - "xrefs": [ - "ncbigene:90" - ] + "xrefs": ["ncbigene:90"], } @@ -719,7 +661,7 @@ def civic_eid26_statement(): """Create a test fixture for CIViC EID26 statement.""" return { "id": "civic.eid:26", - "description": "In acute myloid leukemia patients, D816 mutation is associated with earlier relapse and poorer prognosis than wildtype KIT.", # noqa: E501 + "description": "In acute myloid leukemia patients, D816 mutation is associated with earlier relapse and poorer prognosis than wildtype KIT.", "direction": "supports", "evidence_level": "civic.evidence_level:B", "proposition": "proposition:_HXqJtIo6MSmwagQUSOot4wdKE7O4DyN", @@ -728,7 +670,7 @@ def civic_eid26_statement(): "disease_descriptor": "civic.did:3", "method": "method:1", "supported_by": ["pmid:16384925"], - "type": "Statement" + "type": "Statement", } @@ -740,7 +682,7 @@ def civic_eid26_proposition(): "predicate": "is_prognostic_of_worse_outcome_for", "subject": "ga4gh:VA.QSLb0bR-CRIFfKIENdHhcuUZwW3IS1aP", "object_qualifier": "ncit:C3171", - "type": "prognostic_proposition" + "type": "prognostic_proposition", } @@ -759,25 +701,16 @@ def civic_vid65(): "interval": { "end": {"value": 820, "type": "Number"}, "start": {"value": 819, "type": "Number"}, - "type": "SequenceInterval" + "type": "SequenceInterval", }, "sequence_id": "ga4gh:SQ.TcMVFj5kDODDWpiy1d_1-3_gOf4BYaAB", - "type": "SequenceLocation" + "type": "SequenceLocation", }, - "state": { - "sequence": "V", - "type": "LiteralSequenceExpression" - }, - "type": "Allele" + "state": {"sequence": "V", "type": "LiteralSequenceExpression"}, + "type": "Allele", }, - "xrefs": [ - "clinvar:13852", - "caid:CA123513", - "dbsnp:121913507" - ], - "alternate_labels": [ - "ASP816VAL" - ], + "xrefs": ["clinvar:13852", "caid:CA123513", "dbsnp:121913507"], + "alternate_labels": ["ASP816VAL"], "extensions": [ { "name": "civic_representative_coordinate", @@ -790,9 +723,9 @@ def civic_vid65(): "representative_transcript": "ENST00000288135.5", "ensembl_version": 75, "reference_build": "GRCh37", - "type": "coordinates" + "type": "coordinates", }, - "type": "Extension" + "type": "Extension", } ], "structural_type": "SO:0001583", @@ -800,25 +733,25 @@ def civic_vid65(): { "syntax": "hgvs.c", "value": "NM_000222.2:c.2447A>T", - "type": "Expression" + "type": "Expression", }, { "syntax": "hgvs.p", "value": "NP_000213.1:p.Asp816Val", - "type": "Expression" + "type": "Expression", }, { "syntax": "hgvs.c", "value": "ENST00000288135.5:c.2447A>T", - "type": "Expression" + "type": "Expression", }, { "syntax": "hgvs.g", "value": "NC_000004.11:g.55599321A>T", - "type": "Expression" - } + "type": "Expression", + }, ], - "gene_context": "civic.gid:29" + "gene_context": "civic.gid:29", } @@ -830,9 +763,7 @@ def civic_did3(): "type": "DiseaseDescriptor", "label": "Acute Myeloid Leukemia", "disease_id": "ncit:C3171", - "xrefs": [ - "DOID:9119" - ] + "xrefs": ["DOID:9119"], } @@ -843,19 +774,10 @@ def civic_gid29(): "id": "civic.gid:29", "type": "GeneDescriptor", "label": "KIT", - "description": "c-KIT activation has been shown to have oncogenic activity in gastrointestinal stromal tumors (GISTs), melanomas, lung cancer, and other tumor types. The targeted therapeutics nilotinib and sunitinib have shown efficacy in treating KIT overactive patients, and are in late-stage trials in melanoma and GIST. KIT overactivity can be the result of many genomic events from genomic amplification to overexpression to missense mutations. Missense mutations have been shown to be key players in mediating clinical response and acquired resistance in patients being treated with these targeted therapeutics.", # noqa: E501 + "description": "c-KIT activation has been shown to have oncogenic activity in gastrointestinal stromal tumors (GISTs), melanomas, lung cancer, and other tumor types. The targeted therapeutics nilotinib and sunitinib have shown efficacy in treating KIT overactive patients, and are in late-stage trials in melanoma and GIST. KIT overactivity can be the result of many genomic events from genomic amplification to overexpression to missense mutations. Missense mutations have been shown to be key players in mediating clinical response and acquired resistance in patients being treated with these targeted therapeutics.", "gene_id": "hgnc:6342", - "alternate_labels": [ - "MASTC", - "KIT", - "SCFR", - "PBT", - "CD117", - "C-Kit" - ], - "xrefs": [ - "ncbigene:3815" - ] + "alternate_labels": ["MASTC", "KIT", "SCFR", "PBT", "CD117", "C-Kit"], + "xrefs": ["ncbigene:3815"], } @@ -864,7 +786,7 @@ def civic_eid1756_statement(): """Create test fixture for CIViC EID1756 statement.""" return { "id": "civic.eid:1756", - "description": "Study of 1817 PCa cases and 2026 cancer free controls to clarify the association of (MTHFR)c.677C>T (and c.1298A>C ) of pancreatic cancer risk in a population of Han Chinese in Shanghai. Results indicated a lower risk for the heterozygous CT genotype and homozygous TT genotype carriers of (MTHFR)c.677C>T which had a significantly lower risk of developing pancreatic cancer compared with the wild-type CC genotype.", # noqa: E501 + "description": "Study of 1817 PCa cases and 2026 cancer free controls to clarify the association of (MTHFR)c.677C>T (and c.1298A>C ) of pancreatic cancer risk in a population of Han Chinese in Shanghai. Results indicated a lower risk for the heterozygous CT genotype and homozygous TT genotype carriers of (MTHFR)c.677C>T which had a significantly lower risk of developing pancreatic cancer compared with the wild-type CC genotype.", "direction": "supports", "evidence_level": "civic.evidence_level:B", "proposition": "proposition:cDLAt3AJPrHQPQ--JpKU4MkU528_kE-a", @@ -873,7 +795,7 @@ def civic_eid1756_statement(): "disease_descriptor": "civic.did:556", "method": "method:1", "supported_by": ["pmid:27819322"], - "type": "Statement" + "type": "Statement", } @@ -885,7 +807,7 @@ def civic_eid1756_proposition(): "predicate": "is_prognostic_of_better_outcome_for", "subject": "ga4gh:VA.Nq7ozfH2X6m1PGr_n38E-F0NZ7I9UASP", "object_qualifier": "ncit:C9005", - "type": "prognostic_proposition" + "type": "prognostic_proposition", } @@ -904,26 +826,16 @@ def civic_vid258(): "interval": { "end": {"value": 222, "type": "Number"}, "start": {"value": 221, "type": "Number"}, - "type": "SequenceInterval" + "type": "SequenceInterval", }, "sequence_id": "ga4gh:SQ.4RSETawLfMkNpQBPepa7Uf9ItHAEJUde", - "type": "SequenceLocation" - }, - "state": { - "sequence": "V", - "type": "LiteralSequenceExpression" + "type": "SequenceLocation", }, - "type": "Allele" + "state": {"sequence": "V", "type": "LiteralSequenceExpression"}, + "type": "Allele", }, - "xrefs": [ - "clinvar:3520", - "caid:CA170990", - "dbsnp:1801133" - ], - "alternate_labels": [ - "C677T", - "ALA222VAL" - ], + "xrefs": ["clinvar:3520", "caid:CA170990", "dbsnp:1801133"], + "alternate_labels": ["C677T", "ALA222VAL"], "extensions": [ { "name": "civic_representative_coordinate", @@ -936,35 +848,31 @@ def civic_vid258(): "representative_transcript": "ENST00000376592.1", "ensembl_version": 75, "reference_build": "GRCh37", - "type": "coordinates" + "type": "coordinates", }, - "type": "Extension" + "type": "Extension", } ], "structural_type": "SO:0001583", "expressions": [ - { - "syntax": "hgvs.c", - "value": "NM_005957.4:c.665C>T", - "type": "Expression" - }, + {"syntax": "hgvs.c", "value": "NM_005957.4:c.665C>T", "type": "Expression"}, { "syntax": "hgvs.p", "value": "NP_005948.3:p.Ala222Val", - "type": "Expression" + "type": "Expression", }, { "syntax": "hgvs.c", "value": "ENST00000376592.1:c.665G>A", - "type": "Expression" + "type": "Expression", }, { "syntax": "hgvs.g", "value": "NC_000001.10:g.11856378G>A", - "type": "Expression" - } + "type": "Expression", + }, ], - "gene_context": "civic.gid:3672" + "gene_context": "civic.gid:3672", } @@ -976,9 +884,7 @@ def civic_did556(): "type": "DiseaseDescriptor", "label": "Pancreatic Cancer", "disease_id": "ncit:C9005", - "xrefs": [ - "DOID:1793" - ] + "xrefs": ["DOID:1793"], } @@ -990,12 +896,8 @@ def civic_gid3672(): "type": "GeneDescriptor", "label": "MTHFR", "gene_id": "hgnc:7436", - "alternate_labels": [ - "MTHFR" - ], - "xrefs": [ - "ncbigene:4524" - ] + "alternate_labels": ["MTHFR"], + "xrefs": ["ncbigene:4524"], } @@ -1006,7 +908,7 @@ def pmid_15146165(): "id": "pmid:15146165", "label": "Lasota et al., 2004, Lab. Invest.", "type": "Document", - "description": "A great majority of GISTs with PDGFRA mutations represent gastric tumors of low or no malignant potential." # noqa: E501 + "description": "A great majority of GISTs with PDGFRA mutations represent gastric tumors of low or no malignant potential.", } @@ -1017,7 +919,7 @@ def pmid_18073307(): "type": "Document", "id": "pmid:18073307", "label": "Elisei et al., 2008, J. Clin. Endocrinol. Metab.", - "description": "Prognostic significance of somatic RET oncogene mutations in sporadic medullary thyroid cancer: a 10-year follow-up study." # noqa: E501 + "description": "Prognostic significance of somatic RET oncogene mutations in sporadic medullary thyroid cancer: a 10-year follow-up study.", } @@ -1027,8 +929,8 @@ def pmid_16384925(): return { "id": "pmid:16384925", "label": "Cairoli et al., 2006, Blood", - "description": "Prognostic impact of c-KIT mutations in core binding factor leukemias: an Italian retrospective study.", # noqa: E501 - "type": "Document" + "description": "Prognostic impact of c-KIT mutations in core binding factor leukemias: an Italian retrospective study.", + "type": "Document", } @@ -1039,8 +941,8 @@ def pmid_27819322(): "type": "Document", "id": "pmid:27819322", "label": "Wu et al., 2016, Sci Rep", - "description": "MTHFR c.677C>T Inhibits Cell Proliferation and Decreases Prostate Cancer Susceptibility in the Han Chinese Population in Shanghai.", # noqa: E501 - "xrefs": ["pmc:PMC5098242"] + "description": "MTHFR c.677C>T Inhibits Cell Proliferation and Decreases Prostate Cancer Susceptibility in the Han Chinese Population in Shanghai.", + "xrefs": ["pmc:PMC5098242"], } @@ -1049,7 +951,7 @@ def moa_aid71_statement(): """Create a MOA Statement 71 test fixture.""" return { "id": "moa.assertion:71", - "description": "T315I mutant ABL1 in p210 BCR-ABL cells resulted in retained high levels of phosphotyrosine at increasing concentrations of inhibitor STI-571, whereas wildtype appropriately received inhibition.", # noqa: E501 + "description": "T315I mutant ABL1 in p210 BCR-ABL cells resulted in retained high levels of phosphotyrosine at increasing concentrations of inhibitor STI-571, whereas wildtype appropriately received inhibition.", "evidence_level": "moa.evidence_level:Preclinical", "proposition": "proposition:4BRAy5ckYBfbzLHr95Xz3M9D9mJpTRxr", "variation_origin": "somatic", @@ -1057,10 +959,8 @@ def moa_aid71_statement(): "therapy_descriptor": "moa.normalize.therapy:Imatinib", "disease_descriptor": "moa.normalize.disease:oncotree%3ACML", "method": "method:4", - "supported_by": [ - "pmid:11423618" - ], - "type": "Statement" + "supported_by": ["pmid:11423618"], + "type": "Statement", } @@ -1073,7 +973,7 @@ def moa_aid71_proposition(): "subject": "ga4gh:VA.M3CbaYfwomLqvJbdK4w-W7V-zw7LdjGj", "object_qualifier": "ncit:C3174", "object": "rxcui:282388", - "type": "therapeutic_response_proposition" + "type": "therapeutic_response_proposition", } @@ -1092,16 +992,13 @@ def moa_vid71(): "interval": { "end": {"value": 315, "type": "Number"}, "start": {"value": 314, "type": "Number"}, - "type": "SequenceInterval" + "type": "SequenceInterval", }, "sequence_id": "ga4gh:SQ.dmFigTG-0fY6I54swb7PoDuxCeT6O3Wg", - "type": "SequenceLocation" + "type": "SequenceLocation", }, - "state": { - "sequence": "I", - "type": "LiteralSequenceExpression" - }, - "type": "Allele" + "state": {"sequence": "I", "type": "LiteralSequenceExpression"}, + "type": "Allele", }, "extensions": [ { @@ -1114,13 +1011,13 @@ def moa_vid71(): "alternate_allele": "T", "cdna_change": "c.944C>T", "protein_change": "p.T315I", - "exon": "5" + "exon": "5", }, - "type": "Extension" + "type": "Extension", } ], "vrs_ref_allele_seq": "T", - "gene_context": "moa.normalize.gene:ABL1" + "gene_context": "moa.normalize.gene:ABL1", } @@ -1131,7 +1028,7 @@ def moa_abl1(): "id": "moa.normalize.gene:ABL1", "type": "GeneDescriptor", "label": "ABL1", - "gene_id": "hgnc:76" + "gene_id": "hgnc:76", } @@ -1143,63 +1040,65 @@ def moa_imatinib(): "type": "TherapyDescriptor", "label": "Imatinib", "therapy_id": "rxcui:282388", - "extensions": [{ - "type": "Extension", - "name": "regulatory_approval", - "value": { - "approval_rating": "FDA", - "has_indications": [ - { - "id": "hemonc:634", - "type": "DiseaseDescriptor", - "label": "Myelodysplastic syndrome", - "disease_id": "ncit:C3247" - }, - { - "id": "hemonc:616", - "type": "DiseaseDescriptor", - "label": "Hypereosinophilic syndrome", - "disease_id": "ncit:C27038" - }, - { - "id": "hemonc:582", - "type": "DiseaseDescriptor", - "label": "Chronic myelogenous leukemia", - "disease_id": "ncit:C3174" - }, - { - "id": "hemonc:669", - "type": "DiseaseDescriptor", - "label": "Systemic mastocytosis", - "disease_id": "ncit:C9235" - }, - { - "id": "hemonc:24309", - "type": "DiseaseDescriptor", - "label": "Acute lymphoblastic leukemia", - "disease_id": "ncit:C3167" - }, - { - "id": "hemonc:667", - "type": "DiseaseDescriptor", - "label": "Soft tissue sarcoma", - "disease_id": "ncit:C9306" - }, - { - "id": "hemonc:602", - "type": "DiseaseDescriptor", - "label": "Gastrointestinal stromal tumor", - "disease_id": "ncit:C3868" - }, - { - "id": "hemonc:33893", - "type": "DiseaseDescriptor", - "label": "Chronic myelogenous leukemia pediatric", - "disease_id": None - } - ] + "extensions": [ + { + "type": "Extension", + "name": "regulatory_approval", + "value": { + "approval_rating": "FDA", + "has_indications": [ + { + "id": "hemonc:634", + "type": "DiseaseDescriptor", + "label": "Myelodysplastic syndrome", + "disease_id": "ncit:C3247", + }, + { + "id": "hemonc:616", + "type": "DiseaseDescriptor", + "label": "Hypereosinophilic syndrome", + "disease_id": "ncit:C27038", + }, + { + "id": "hemonc:582", + "type": "DiseaseDescriptor", + "label": "Chronic myelogenous leukemia", + "disease_id": "ncit:C3174", + }, + { + "id": "hemonc:669", + "type": "DiseaseDescriptor", + "label": "Systemic mastocytosis", + "disease_id": "ncit:C9235", + }, + { + "id": "hemonc:24309", + "type": "DiseaseDescriptor", + "label": "Acute lymphoblastic leukemia", + "disease_id": "ncit:C3167", + }, + { + "id": "hemonc:667", + "type": "DiseaseDescriptor", + "label": "Soft tissue sarcoma", + "disease_id": "ncit:C9306", + }, + { + "id": "hemonc:602", + "type": "DiseaseDescriptor", + "label": "Gastrointestinal stromal tumor", + "disease_id": "ncit:C3868", + }, + { + "id": "hemonc:33893", + "type": "DiseaseDescriptor", + "label": "Chronic myelogenous leukemia pediatric", + "disease_id": None, + }, + ], + }, } - }] + ], } @@ -1210,7 +1109,7 @@ def moa_chronic_myelogenous_leukemia(): "id": "moa.normalize.disease:oncotree%3ACML", "type": "DiseaseDescriptor", "label": "Chronic Myelogenous Leukemia", - "disease_id": "ncit:C3174" + "disease_id": "ncit:C3174", } @@ -1219,15 +1118,11 @@ def method1(): """Create test fixture for method:1.""" return { "id": "method:1", - "label": "Standard operating procedure for curation and clinical interpretation of variants in cancer", # noqa: E501 - "url": "https://genomemedicine.biomedcentral.com/articles/10.1186/s13073-019-0687-x", # noqa: E501 - "version": { - "year": 2019, - "month": 11, - "day": 29 - }, + "label": "Standard operating procedure for curation and clinical interpretation of variants in cancer", + "url": "https://genomemedicine.biomedcentral.com/articles/10.1186/s13073-019-0687-x", + "version": {"year": 2019, "month": 11, "day": 29}, "authors": "Danos, A.M., Krysiak, K., Barnell, E.K. et al.", - "type": "Method" + "type": "Method", } @@ -1237,13 +1132,10 @@ def method2(): return { "id": "method:2", "type": "Method", - "label": "Standards and Guidelines for the Interpretation and Reporting of Sequence Variants in Cancer: A Joint Consensus Recommendation of the Association for Molecular Pathology, American Society of Clinical Oncology, and College of American Pathologists", # noqa: E501 + "label": "Standards and Guidelines for the Interpretation and Reporting of Sequence Variants in Cancer: A Joint Consensus Recommendation of the Association for Molecular Pathology, American Society of Clinical Oncology, and College of American Pathologists", "url": "https://pubmed.ncbi.nlm.nih.gov/27993330/", - "version": { - "year": 2017, - "month": 1 - }, - "authors": "Li MM, Datto M, Duncavage EJ, et al." + "version": {"year": 2017, "month": 1}, + "authors": "Li MM, Datto M, Duncavage EJ, et al.", } @@ -1252,14 +1144,11 @@ def method3(): """Create test fixture for method:3.""" return { "id": "method:3", - "label": "Standards and guidelines for the interpretation of sequence variants: a joint consensus recommendation of the American College of Medical Genetics and Genomics and the Association for Molecular Pathology", # noqa: E501 + "label": "Standards and guidelines for the interpretation of sequence variants: a joint consensus recommendation of the American College of Medical Genetics and Genomics and the Association for Molecular Pathology", "url": "https://pubmed.ncbi.nlm.nih.gov/25741868/", - "version": { - "year": 2015, - "month": 5 - }, + "version": {"year": 2015, "month": 5}, "type": "Method", - "authors": "Richards S, Aziz N, Bale S, et al." + "authors": "Richards S, Aziz N, Bale S, et al.", } @@ -1268,15 +1157,11 @@ def method4(): """Create a test fixture for MOA method:4.""" return { "id": "method:4", - "label": "Clinical interpretation of integrative molecular profiles to guide precision cancer medicine", # noqa: E501 + "label": "Clinical interpretation of integrative molecular profiles to guide precision cancer medicine", "url": "https://www.biorxiv.org/content/10.1101/2020.09.22.308833v1", "type": "Method", - "version": { - "year": 2020, - "month": 9, - "day": 22 - }, - "authors": "Reardon, B., Moore, N.D., Moore, N. et al." + "version": {"year": 2020, "month": 9, "day": 22}, + "authors": "Reardon, B., Moore, N.D., Moore, N. et al.", } @@ -1291,11 +1176,9 @@ def pmid_11423618(): """Create a test fixture for PMID 11423618.""" return { "id": "pmid:11423618", - "label": "Gorre, Mercedes E., et al. \"Clinical resistance to STI-571 cancer therapy caused by BCR-ABL gene mutation or amplification.\" Science 293.5531 (2001): 876-880.", # noqa: E501 - "xrefs": [ - "doi:10.1126/science.1062538" - ], - "type": "Document" + "label": 'Gorre, Mercedes E., et al. "Clinical resistance to STI-571 cancer therapy caused by BCR-ABL gene mutation or amplification." Science 293.5531 (2001): 876-880.', + "xrefs": ["doi:10.1126/science.1062538"], + "type": "Document", } @@ -1310,7 +1193,7 @@ def oncokb_diagnostic_statement1(): "variation_descriptor": "oncokb.variant:BRAF%20V600E", "disease_descriptor": "oncokb.disease:611", "method": "method:5", - "supported_by": ["pmid:25422482", "pmid:26637772"] + "supported_by": ["pmid:25422482", "pmid:26637772"], } @@ -1322,7 +1205,7 @@ def oncokb_diagnostic_proposition1(): "type": "diagnostic_proposition", "predicate": "is_diagnostic_inclusion_criterion_for", "subject": "ga4gh:VA.ZDdoQdURgO2Daj2NxLj4pcDnjiiAsfbO", - "object_qualifier": "ncit:C53972" + "object_qualifier": "ncit:C53972", } @@ -1331,7 +1214,7 @@ def oncokb_therapeutic_statement1(): """Create test fixture for OncoKB BRAF V600E therapeutic evidence""" return { "id": "oncokb.evidence:xKWfpPS0aNLElHg9v3mwmb9WMaT8P1pf", - "description": "Trametinib is an oral small molecule inhibitor of MEK1/2 that is FDA-approved alone or with dabrafenib for the treatment of patients with metastatic melanoma harboring a V600E or V600K BRAF mutation. In an open-label, randomized Phase III trial, patients with BRAF V600E/K-mutated unresectable, metastatic melanoma received oral trametinib (2 mg once daily) or an intravenous regimen of either dacarbazine (1000 mg/m2) or paclitaxel (175 mg/m2) every three weeks. Trametinib demonstrated improved progression-free survival (HR for disease progression or death = 0.45) and six-month overall survival (81% vs. 67%; death HR = 0.54; p=0.01) (PMID: 22663011). However, like other MEK inhibitors, the benefit of trametinib is limited by adverse reactions, most notably grade three or four rash and diarrhea (PMID: 22663011). Trametinib is not typically used as monotherapy for patients with BRAF V600K melanoma given its lower response rate compared to BRAF inhibitors and combined BRAF and MEK inhibitors. Patients previously treated with a RAF inhibitor appear to be less likely than untreated patients to respond to trametinib treatment (PMID: 22663011), and FDA guidelines state that trametinib as a monotherapy is not indicated for these patients. Dabrafenib and trametinib are FDA-approved as a combination therapy, which has superior clinical outcomes compared to dabrafenib or trametinib monotherapy (PMID: 25399551, 25265492). Additionally, patients with melanoma treated with dabrafenib and trametinib in both the neoadjuvant and adjuvant settings had improved survival over patients given standard of care (PMID: 29361468).", # noqa: E501 + "description": "Trametinib is an oral small molecule inhibitor of MEK1/2 that is FDA-approved alone or with dabrafenib for the treatment of patients with metastatic melanoma harboring a V600E or V600K BRAF mutation. In an open-label, randomized Phase III trial, patients with BRAF V600E/K-mutated unresectable, metastatic melanoma received oral trametinib (2 mg once daily) or an intravenous regimen of either dacarbazine (1000 mg/m2) or paclitaxel (175 mg/m2) every three weeks. Trametinib demonstrated improved progression-free survival (HR for disease progression or death = 0.45) and six-month overall survival (81% vs. 67%; death HR = 0.54; p=0.01) (PMID: 22663011). However, like other MEK inhibitors, the benefit of trametinib is limited by adverse reactions, most notably grade three or four rash and diarrhea (PMID: 22663011). Trametinib is not typically used as monotherapy for patients with BRAF V600K melanoma given its lower response rate compared to BRAF inhibitors and combined BRAF and MEK inhibitors. Patients previously treated with a RAF inhibitor appear to be less likely than untreated patients to respond to trametinib treatment (PMID: 22663011), and FDA guidelines state that trametinib as a monotherapy is not indicated for these patients. Dabrafenib and trametinib are FDA-approved as a combination therapy, which has superior clinical outcomes compared to dabrafenib or trametinib monotherapy (PMID: 25399551, 25265492). Additionally, patients with melanoma treated with dabrafenib and trametinib in both the neoadjuvant and adjuvant settings had improved survival over patients given standard of care (PMID: 29361468).", "type": "Statement", "evidence_level": "oncokb.evidence_level:LEVEL_1", "proposition": "proposition:EOEfYXjsyQmgV2sNA-gfK5i0Cj8WGGuw", @@ -1339,18 +1222,22 @@ def oncokb_therapeutic_statement1(): "disease_descriptor": "oncokb.disease:453", "therapy_descriptor": "oncokb.normalize.therapy:Trametinib", "method": "method:5", - "supported_by": ["pmid:29361468", "pmid:25399551", "pmid:22663011", - "pmid:25265492"], + "supported_by": [ + "pmid:29361468", + "pmid:25399551", + "pmid:22663011", + "pmid:25265492", + ], "extensions": [ { "type": "Extension", "name": "onckb_fda_level", "value": { "level": "LEVEL_Fda2", - "description": "Cancer Mutations with Evidence of Clinical Significance" # noqa: E501 - } + "description": "Cancer Mutations with Evidence of Clinical Significance", + }, } - ] + ], } @@ -1363,7 +1250,7 @@ def oncokb_therapeutic_proposition1(): "predicate": "predicts_sensitivity_to", "subject": "ga4gh:VA.ZDdoQdURgO2Daj2NxLj4pcDnjiiAsfbO", "object_qualifier": "ncit:C3224", - "object": "rxcui:1425098" + "object": "rxcui:1425098", } @@ -1383,30 +1270,23 @@ def oncokb_braf_v600e_vd(): "interval": { "end": {"value": 600, "type": "Number"}, "start": {"value": 599, "type": "Number"}, - "type": "SequenceInterval" + "type": "SequenceInterval", }, "sequence_id": "ga4gh:SQ.cQvw4UsHHRRlogxbWCB8W-mKD4AraM9y", - "type": "SequenceLocation" + "type": "SequenceLocation", }, - "state": { - "sequence": "E", - "type": "LiteralSequenceExpression" - }, - "type": "Allele" + "state": {"sequence": "E", "type": "LiteralSequenceExpression"}, + "type": "Allele", }, "gene_context": "oncokb.normalize.gene:BRAF", "extensions": [ - { - "type": "Extension", - "name": "oncogenic", - "value": "Oncogenic" - }, + {"type": "Extension", "name": "oncogenic", "value": "Oncogenic"}, { "type": "Extension", "name": "mutation_effect", "value": { "knownEffect": "Gain-of-function", - "description": "The class I activating exon 15 BRAF V600E mutation is located in the kinase domain of the BRAF protein and is highly recurrent in melanoma, lung and thyroid cancer, among others (PMID: 28783719, 26091043, 25079552, 23833300, 25417114, 28783719, 12068308). This mutation has been comprehensively biologically characterized and has been shown to activate the downstream MAPK pathway independent of RAS (PMID: 15035987, 12068308, 19251651, 26343582), to render BRAF constitutively activated in monomeric form (PMID: 20179705), and to retain sensitivity to RAF monomer inhibitors such as vemurafenib and dabrafenib (PMID:26343582, 28783719, 20179705, 30351999).", # noqa: E501 + "description": "The class I activating exon 15 BRAF V600E mutation is located in the kinase domain of the BRAF protein and is highly recurrent in melanoma, lung and thyroid cancer, among others (PMID: 28783719, 26091043, 25079552, 23833300, 25417114, 28783719, 12068308). This mutation has been comprehensively biologically characterized and has been shown to activate the downstream MAPK pathway independent of RAS (PMID: 15035987, 12068308, 19251651, 26343582), to render BRAF constitutively activated in monomeric form (PMID: 20179705), and to retain sensitivity to RAF monomer inhibitors such as vemurafenib and dabrafenib (PMID:26343582, 28783719, 20179705, 30351999).", "citations": { "pmids": [ "25417114", @@ -1419,43 +1299,31 @@ def oncokb_braf_v600e_vd(): "25079552", "28783719", "19251651", - "15035987" + "15035987", ], - "abstracts": [] - } - } - }, - { - "type": "Extension", - "name": "hotspot", - "value": True - }, - { - "type": "Extension", - "name": "vus", - "value": False + "abstracts": [], + }, + }, }, + {"type": "Extension", "name": "hotspot", "value": True}, + {"type": "Extension", "name": "vus", "value": False}, { "type": "Extension", "name": "oncokb_highest_sensitive_level", - "value": "LEVEL_1" + "value": "LEVEL_1", }, { "type": "Extension", "name": "oncokb_highest_diagnostic_implication_level", - "value": "LEVEL_Dx2" + "value": "LEVEL_Dx2", }, { "type": "Extension", "name": "oncokb_highest_fda_level", - "value": "LEVEL_Fda2" + "value": "LEVEL_Fda2", }, - { - "type": "Extension", - "name": "allele_exist", - "value": True - } - ] + {"type": "Extension", "name": "allele_exist", "value": True}, + ], } @@ -1467,50 +1335,42 @@ def oncokb_braf_gene_descriptor(): "type": "GeneDescriptor", "label": "BRAF", "gene_id": "hgnc:1097", - "description": "BRAF, an intracellular kinase, is frequently mutated in melanoma, thyroid and lung cancers among others.", # noqa: E501 + "description": "BRAF, an intracellular kinase, is frequently mutated in melanoma, thyroid and lung cancers among others.", "xrefs": ["ncbigene:673"], "extensions": [ { "type": "Extension", "name": "ensembl_transcript_GRCh37", - "value": "ENST00000288602" + "value": "ENST00000288602", }, { "type": "Extension", "name": "refseq_transcript_GRCh37", - "value": "NM_004333.4" + "value": "NM_004333.4", }, { "type": "Extension", "name": "ensembl_transcript_GRCh38", - "value": "ENST00000646891" + "value": "ENST00000646891", }, { "type": "Extension", "name": "refseq_transcript_GRCh38", - "value": "NM_004333.4" - }, - { - "type": "Extension", - "name": "oncogene", - "value": True + "value": "NM_004333.4", }, + {"type": "Extension", "name": "oncogene", "value": True}, { "type": "Extension", "name": "oncokb_highest_sensitive_level", - "value": "1" + "value": "1", }, { "type": "Extension", "name": "oncokb_background", - "value": "BRAF is a serine/threonine kinase that plays a key role in the regulation of the mitogen-activated protein kinase (MAPK) cascade (PMID: 15520807), which under physiologic conditions regulates the expression of genes involved in cellular functions, including proliferation (PMID: 24202393). Genetic alterations in BRAF are found in a large percentage of melanomas, thyroid cancers and histiocytic neoplasms as well as a small fraction of lung and colorectal cancers. The most common BRAF point mutation is V600E, which deregulates the protein's kinase activity leading to constitutive BRAF activation, as BRAF V600E can signal as a monomer independently of RAS or upstream activation (PMID: 20179705). Other BRAF mutations have been found that affect the protein's propensity to dimerize (PMID: 16858395, 26343582, 12068308). The product of these alterations is a BRAF kinase that can activate MAPK signaling in an unregulated manner and, in some instances, is directly responsible for cancer growth (PMID: 15520807). Inhibitors of mutant BRAF, including vemurafenib and dabrafenib, are FDA-approved for the treatment of late-stage or unresectable melanoma.", # noqa: E501 + "value": "BRAF is a serine/threonine kinase that plays a key role in the regulation of the mitogen-activated protein kinase (MAPK) cascade (PMID: 15520807), which under physiologic conditions regulates the expression of genes involved in cellular functions, including proliferation (PMID: 24202393). Genetic alterations in BRAF are found in a large percentage of melanomas, thyroid cancers and histiocytic neoplasms as well as a small fraction of lung and colorectal cancers. The most common BRAF point mutation is V600E, which deregulates the protein's kinase activity leading to constitutive BRAF activation, as BRAF V600E can signal as a monomer independently of RAS or upstream activation (PMID: 20179705). Other BRAF mutations have been found that affect the protein's propensity to dimerize (PMID: 16858395, 26343582, 12068308). The product of these alterations is a BRAF kinase that can activate MAPK signaling in an unregulated manner and, in some instances, is directly responsible for cancer growth (PMID: 15520807). Inhibitors of mutant BRAF, including vemurafenib and dabrafenib, are FDA-approved for the treatment of late-stage or unresectable melanoma.", }, - { - "type": "Extension", - "name": "tumor_suppressor_gene", - "value": False - } - ] + {"type": "Extension", "name": "tumor_suppressor_gene", "value": False}, + ], } @@ -1525,10 +1385,10 @@ def oncokb_trametinib_therapy_descriptor(): "alternate_labels": [ "JTP-74057", "MEK Inhibitor GSK1120212", - "N-(3-{3-cyclopropyl-5-[(2-fluoro-4-iodophenyl)amino]-6,8-dimethyl-2,4,7-trioxo-3,4,6,7-tetrahydropyrido[4,3-d]pyrimidin-1(2H)-yl}phenyl)acetamide", # noqa: E501 + "N-(3-{3-cyclopropyl-5-[(2-fluoro-4-iodophenyl)amino]-6,8-dimethyl-2,4,7-trioxo-3,4,6,7-tetrahydropyrido[4,3-d]pyrimidin-1(2H)-yl}phenyl)acetamide", "GSK1120212", "TRAMETINIB", - "Trametinib" + "Trametinib", ], "xrefs": ["ncit:C77908"], "extensions": [ @@ -1542,12 +1402,12 @@ def oncokb_trametinib_therapy_descriptor(): "id": "mesh:D009369", "type": "DiseaseDescriptor", "label": "Neoplasms", - "disease_id": "ncit:C3262" + "disease_id": "ncit:C3262", } - ] - } + ], + }, } - ] + ], } @@ -1564,33 +1424,13 @@ def oncokb_ecd_disease_descriptor(): { "type": "Extension", "name": "oncotree_main_type", - "value": { - "id": None, - "name": "Histiocytosis", - "tumor_form": "LIQUID" - } - }, - { - "type": "Extension", - "name": "tissue", - "value": "Myeloid" - }, - { - "type": "Extension", - "name": "parent", - "value": "HDCN" - }, - { - "type": "Extension", - "name": "level", - "value": 4 + "value": {"id": None, "name": "Histiocytosis", "tumor_form": "LIQUID"}, }, - { - "type": "Extension", - "name": "tumor_form", - "value": "LIQUID" - } - ] + {"type": "Extension", "name": "tissue", "value": "Myeloid"}, + {"type": "Extension", "name": "parent", "value": "HDCN"}, + {"type": "Extension", "name": "level", "value": 4}, + {"type": "Extension", "name": "tumor_form", "value": "LIQUID"}, + ], } @@ -1607,33 +1447,13 @@ def oncokb_mel_disease_descriptor(): { "type": "Extension", "name": "oncotree_main_type", - "value": { - "id": None, - "name": "Melanoma", - "tumor_form": "SOLID" - } - }, - { - "type": "Extension", - "name": "tissue", - "value": "Skin" - }, - { - "type": "Extension", - "name": "parent", - "value": "SKIN" - }, - { - "type": "Extension", - "name": "level", - "value": 2 + "value": {"id": None, "name": "Melanoma", "tumor_form": "SOLID"}, }, - { - "type": "Extension", - "name": "tumor_form", - "value": "SOLID" - } - ] + {"type": "Extension", "name": "tissue", "value": "Skin"}, + {"type": "Extension", "name": "parent", "value": "SKIN"}, + {"type": "Extension", "name": "level", "value": 2}, + {"type": "Extension", "name": "tumor_form", "value": "SOLID"}, + ], } @@ -1641,16 +1461,8 @@ def oncokb_mel_disease_descriptor(): def oncokb_diagnostic1_documents(): """Create test fixture for OncoKB diagnostic evidence 1 documents""" return [ - { - "id": "pmid:25422482", - "label": "PubMed 25422482", - "type": "Document" - }, - { - "id": "pmid:26637772", - "label": "PubMed 26637772", - "type": "Document" - } + {"id": "pmid:25422482", "label": "PubMed 25422482", "type": "Document"}, + {"id": "pmid:26637772", "label": "PubMed 26637772", "type": "Document"}, ] @@ -1661,27 +1473,15 @@ def oncokb_therapeutic1_documents_query(): and a more detailed label for pmid:22663011 """ return [ - { - "id": "pmid:29361468", - "label": "PubMed 29361468", - "type": "Document" - }, - { - "id": "pmid:25399551", - "label": "PubMed 25399551", - "type": "Document" - }, + {"id": "pmid:29361468", "label": "PubMed 29361468", "type": "Document"}, + {"id": "pmid:25399551", "label": "PubMed 25399551", "type": "Document"}, { "id": "pmid:22663011", "label": "Flaherty et al., 2012, N. Engl. J. Med.", - "description": "Improved survival with MEK inhibition in BRAF-mutated melanoma.", # noqa: E501 - "type": "Document" + "description": "Improved survival with MEK inhibition in BRAF-mutated melanoma.", + "type": "Document", }, - { - "id": "pmid:25265492", - "label": "PubMed 25265492", - "type": "Document" - } + {"id": "pmid:25265492", "label": "PubMed 25265492", "type": "Document"}, ] @@ -1692,12 +1492,9 @@ def oncokb_method(): "id": "method:5", "label": "OncoKB Curation Standard Operating Procedure", "url": "https://sop.oncokb.org/", - "version": { - "year": 2021, - "month": 11 - }, + "version": {"year": 2021, "month": 11}, "authors": "OncoKB", - "type": "Method" + "type": "Method", } @@ -1710,6 +1507,7 @@ def sources_count() -> int: @pytest.fixture(scope="session") def check_statement(): """Create a test fixture to compare statements.""" + def check_statement(actual, test): """Check that statements are match.""" assert actual.keys() == test.keys() @@ -1736,12 +1534,14 @@ def check_statement(actual, test): assert actual["method"] == test["method"] assert set(actual["supported_by"]) == set(test["supported_by"]) assert actual["type"] == test["type"] + return check_statement @pytest.fixture(scope="session") def check_proposition(): """Create a test fixture to compare propositions.""" + def check_proposition(actual, test): """Check that propositions match.""" assert actual.keys() == test.keys() @@ -1754,20 +1554,29 @@ def check_proposition(actual, test): assert actual["predicate"] == test["predicate"] assert actual["subject"] == test["subject"] assert actual["object_qualifier"] == test["object_qualifier"] + return check_proposition @pytest.fixture(scope="session") def check_variation_descriptor(): """Create a test fixture to compare variation descriptors.""" + def check_variation_descriptor(actual, test, check_descriptor=None, nested=False): """Check that variation descriptors match.""" actual_keys = actual.keys() test_keys = test.keys() assert actual_keys == test_keys for key in test_keys: - if key in ["id", "type", "label", "description", "variation_id", - "structural_type", "vrs_ref_allele_seq"]: + if key in [ + "id", + "type", + "label", + "description", + "variation_id", + "structural_type", + "vrs_ref_allele_seq", + ]: assert actual[key] == test[key] elif key == "gene_context": if nested: @@ -1783,8 +1592,7 @@ def check_variation_descriptor(actual, test, check_descriptor=None, nested=False for test_extension in test["extensions"]: for actual_extension in actual["extensions"]: if test_extension["name"] == actual_extension["name"]: - if test_extension["name"] != \ - "civic_actionability_score": + if test_extension["name"] != "civic_actionability_score": assert actual_extension == test_extension else: try: @@ -1797,12 +1605,14 @@ def check_variation_descriptor(actual, test, check_descriptor=None, nested=False assert len(actual["expressions"]) == len(test["expressions"]) for expression in test["expressions"]: assert expression in actual["expressions"] + return check_variation_descriptor @pytest.fixture(scope="session") def check_descriptor(): """Test fixture to compare gene, therapy, and disease descriptors.""" + def check_descriptor(actual, test): """Check that gene, therapy, and disease descriptors match.""" actual_keys = actual.keys() @@ -1818,8 +1628,13 @@ def check_descriptor(actual, test): assert len(actual["extensions"]) == 1 actual_ext = actual["extensions"][0] test_ext = test["extensions"][0] - assert actual_ext["value"]["approval_rating"] == test_ext["value"]["approval_rating"] # noqa: E501 - assert len(actual_ext["value"]["has_indications"]) == len(test_ext["value"]["has_indications"]) # noqa: E501 + assert ( + actual_ext["value"]["approval_rating"] + == test_ext["value"]["approval_rating"] + ) + assert len(actual_ext["value"]["has_indications"]) == len( + test_ext["value"]["has_indications"] + ) for x in test_ext["value"]["has_indications"]: assert x in actual_ext["value"]["has_indications"], x else: @@ -1827,21 +1642,25 @@ def check_descriptor(actual, test): assert x in actual[key], x else: assert actual[key] == test[key] + return check_descriptor @pytest.fixture(scope="session") def check_method(): """Create a test fixture to compare methods.""" + def check_method(actual, test): """Check that methods match.""" assert actual == test + return check_method @pytest.fixture(scope="session") def check_document(): """Create a test fixture to compare documents.""" + def check_document(actual, test): """Check that documents match.""" actual_keys = actual.keys() @@ -1853,35 +1672,51 @@ def check_document(actual, test): assert set(actual[key]) == set(test[key]) else: assert actual == test + return check_document @pytest.fixture(scope="session") def check_transformed_cdm(): """Test fixture to compare CDM transformations.""" - def check_transformed_cdm(data, statements, propositions, - variation_descriptors, gene_descriptors, - disease_descriptors, therapy_descriptors, - civic_methods, documents, check_statement, - check_proposition, check_variation_descriptor, - check_descriptor, check_document, check_method, - transformed_file): + + def check_transformed_cdm( + data, + statements, + propositions, + variation_descriptors, + gene_descriptors, + disease_descriptors, + therapy_descriptors, + civic_methods, + documents, + check_statement, + check_proposition, + check_variation_descriptor, + check_descriptor, + check_document, + check_method, + transformed_file, + ): """Test that transform to CDM works correctly.""" tests = ( (data["statements"], statements, check_statement), (data["propositions"], propositions, check_proposition), - (data["variation_descriptors"], variation_descriptors, - check_variation_descriptor), + ( + data["variation_descriptors"], + variation_descriptors, + check_variation_descriptor, + ), (data["gene_descriptors"], gene_descriptors, check_descriptor), - (data["disease_descriptors"], disease_descriptors, - check_descriptor), + (data["disease_descriptors"], disease_descriptors, check_descriptor), (data["methods"], civic_methods, check_method), - (data["documents"], documents, check_document) + (data["documents"], documents, check_document), ) if therapy_descriptors: - tests += (data["therapy_descriptors"], therapy_descriptors, - check_descriptor), + tests += ( + (data["therapy_descriptors"], therapy_descriptors, check_descriptor), + ) for actual_data, test_data, test_fixture in tests: assert len(actual_data) == len(test_data) @@ -1896,6 +1731,7 @@ def check_transformed_cdm(data, statements, propositions, assert checked_id == test_id, f"{actual_id} does not match expected" os.remove(transformed_file) + return check_transformed_cdm diff --git a/tests/unit/database/test_database.py b/tests/unit/database/test_database.py index 8df819e5..b598a9dc 100644 --- a/tests/unit/database/test_database.py +++ b/tests/unit/database/test_database.py @@ -1,7 +1,7 @@ """Validate property and relationship rules for graph DB.""" -import pytest from typing import Optional +import pytest from metakb.database import Graph @@ -16,6 +16,7 @@ def graph(): @pytest.fixture(scope="session") def check_unique_property(graph: Graph): """Verify that IDs are unique""" + def _check_function(label: str, property: str): query = f""" MATCH (x:{label}) @@ -27,12 +28,14 @@ def _check_function(label: str, property: str): record = s.run(query).single() assert record.values()[0] == 0 + return _check_function @pytest.fixture(scope="session") def check_single_label(graph: Graph): """Check that nodes don't contain additional labels""" + def _check_function(label: str): query = f""" MATCH (a:{label}) @@ -42,6 +45,7 @@ def _check_function(label: str): with graph.driver.session() as s: record = s.run(query).single() assert record.values()[0] == 0 + return _check_function @@ -50,6 +54,7 @@ def check_descriptor_count(graph: Graph, sources_count: int): """Check that value contains no more than 1 descriptor for each source, and at least 1 descriptor overall. """ + def _check_function(label: str, max_descriptors: int = sources_count): query = f""" MATCH (a:{label}) @@ -61,12 +66,14 @@ def _check_function(label: str, max_descriptors: int = sources_count): with graph.driver.session() as s: record = s.run(query).single() assert record.values()[0] == 0 + return _check_function @pytest.fixture(scope="session") def check_describes_count(graph: Graph): """Check that descriptor only describes 1 value object""" + def _check_function(label: str): query = f""" MATCH (d:{label}Descriptor) @@ -78,6 +85,7 @@ def _check_function(label: str): with graph.driver.session() as s: record = s.run(query).single() assert record.values()[0] == 0 + return _check_function @@ -87,6 +95,7 @@ def check_proposition_relation(graph: Graph): Provided relation value should be coming from the proposition, ie one of {"HAS_SUBJECT", "HAS_OBJECT", "HAS_OBJECT_QUALIFIER"} """ + def _check_function(label: str, relation: str): query = f""" MATCH (v:{label}) @@ -96,12 +105,14 @@ def _check_function(label: str, relation: str): with graph.driver.session() as s: record = s.run(query).single() assert record.values()[0] == 0 + return _check_function @pytest.fixture(scope="session") def check_statement_relation(graph: Graph): """Check that descriptor is used in a statement.""" + def _check_function(value_label: str): query = f""" MATCH (d:{value_label}Descriptor) @@ -113,6 +124,7 @@ def _check_function(value_label: str): with graph.driver.session() as s: record = s.run(query).single() assert record.values()[0] == 0 + return _check_function @@ -121,9 +133,15 @@ def check_relation_count(graph: Graph): """Check that the quantity of relationships from one Node type to another are within a certain range. """ - def _check_function(self_label: str, other_label: str, relation: str, - min: int = 1, max: Optional[int] = 1, - direction: Optional[str] = "out"): + + def _check_function( + self_label: str, + other_label: str, + relation: str, + min: int = 1, + max: Optional[int] = 1, + direction: Optional[str] = "out", + ): if direction == "out": rel_query = f"-[:{relation}]->" elif direction == "in": @@ -143,59 +161,75 @@ def _check_function(self_label: str, other_label: str, relation: str, with graph.driver.session() as s: record = s.run(query).single() assert record.values()[0] == 0 + return _check_function -def test_gene_rules(check_unique_property, check_single_label, - check_descriptor_count): +def test_gene_rules(check_unique_property, check_single_label, check_descriptor_count): """Verify property and relationship rules for Gene nodes.""" check_unique_property("Gene", "id") check_single_label("Gene") check_descriptor_count("Gene") -def test_gene_descriptor_rules(check_unique_property, check_single_label, - check_describes_count): +def test_gene_descriptor_rules( + check_unique_property, check_single_label, check_describes_count +): """Verify property and relationship rules for GeneDescriptor nodes.""" check_unique_property("GeneDescriptor", "id") check_single_label("GeneDescriptor") check_describes_count("Gene") -def test_variation_rules(graph, check_unique_property, check_descriptor_count, - check_proposition_relation): +def test_variation_rules( + graph, check_unique_property, check_descriptor_count, check_proposition_relation +): """Verify property and relationship rules for Variation nodes.""" check_unique_property("Variation", "id") check_descriptor_count("Variation", 4) check_proposition_relation("Variation", "HAS_SUBJECT") -def test_variation_descriptor_rules(check_unique_property, check_single_label, - check_describes_count, - check_statement_relation, - check_relation_count): +def test_variation_descriptor_rules( + check_unique_property, + check_single_label, + check_describes_count, + check_statement_relation, + check_relation_count, +): """Verify property and relationship rules for VariationDescriptor nodes.""" check_unique_property("VariationDescriptor", "id") check_single_label("VariationDescriptor") check_describes_count("Variation") check_statement_relation("Variation") check_relation_count("VariationDescriptor", "GeneDescriptor", "HAS_GENE") - check_relation_count("VariationDescriptor", "VariationGroup", - "IN_VARIATION_GROUP", min=0, max=1) + check_relation_count( + "VariationDescriptor", "VariationGroup", "IN_VARIATION_GROUP", min=0, max=1 + ) -def test_variation_group_rules(check_unique_property, check_single_label, - check_relation_count): +def test_variation_group_rules( + check_unique_property, check_single_label, check_relation_count +): """Verify property and relationship rules for VariationDescriptor nodes.""" check_unique_property("VariationGroup", "id") check_single_label("VariationGroup") - check_relation_count("VariationGroup", "VariationDescriptor", - "IN_VARIATION_GROUP", max=None, direction="in") + check_relation_count( + "VariationGroup", + "VariationDescriptor", + "IN_VARIATION_GROUP", + max=None, + direction="in", + ) -def test_therapy_rules(check_unique_property, check_single_label, - check_proposition_relation, check_descriptor_count, - sources_count): +def test_therapy_rules( + check_unique_property, + check_single_label, + check_proposition_relation, + check_descriptor_count, + sources_count, +): """Verify property and relationship rules for Therapy nodes.""" check_unique_property("Therapy", "id") check_single_label("Therapy") @@ -204,9 +238,12 @@ def test_therapy_rules(check_unique_property, check_single_label, check_descriptor_count("Therapy", sources_count + 1) -def test_therapy_descriptor_rules(check_unique_property, check_single_label, - check_describes_count, - check_statement_relation): +def test_therapy_descriptor_rules( + check_unique_property, + check_single_label, + check_describes_count, + check_statement_relation, +): """Verify property and relationship rules for TherapyDescriptor nodes.""" check_unique_property("TherapyDescriptor", "id") check_single_label("TherapyDescriptor") @@ -214,9 +251,13 @@ def test_therapy_descriptor_rules(check_unique_property, check_single_label, check_statement_relation("Therapy") -def test_disease_rules(check_unique_property, check_single_label, - check_proposition_relation, check_descriptor_count, - sources_count): +def test_disease_rules( + check_unique_property, + check_single_label, + check_proposition_relation, + check_descriptor_count, + sources_count, +): """Verify property and relationship rules for disease nodes.""" check_unique_property("Disease", "id") check_single_label("Disease") @@ -225,9 +266,12 @@ def test_disease_rules(check_unique_property, check_single_label, check_descriptor_count("Disease", sources_count + 1) -def test_disease_descriptor_rules(check_unique_property, check_single_label, - check_describes_count, - check_statement_relation): +def test_disease_descriptor_rules( + check_unique_property, + check_single_label, + check_describes_count, + check_statement_relation, +): """Verify property and relationship rules for DiseaseDescriptor nodes.""" check_unique_property("DiseaseDescriptor", "id") check_single_label("DiseaseDescriptor") @@ -235,17 +279,20 @@ def test_disease_descriptor_rules(check_unique_property, check_single_label, check_statement_relation("Disease") -def test_statement_rules(graph: Graph, check_unique_property, - check_single_label, check_descriptor_count, - check_relation_count): +def test_statement_rules( + graph: Graph, + check_unique_property, + check_single_label, + check_descriptor_count, + check_relation_count, +): """Verify property and relationship rules for Statement nodes.""" check_unique_property("Statement", "id") check_single_label("Statement") check_relation_count("Statement", "VariationDescriptor", "HAS_VARIATION") check_relation_count("Statement", "DiseaseDescriptor", "HAS_DISEASE") - check_relation_count("Statement", "TherapyDescriptor", "HAS_THERAPY", - min=0) + check_relation_count("Statement", "TherapyDescriptor", "HAS_THERAPY", min=0) check_relation_count("Statement", "Proposition", "DEFINED_BY") check_relation_count("Statement", "Method", "USES_METHOD") @@ -301,22 +348,20 @@ def test_proposition_rules(graph, check_unique_property): assert record.values()[0] == 0 -def test_document_rules(check_unique_property, check_single_label, - check_relation_count): +def test_document_rules( + check_unique_property, check_single_label, check_relation_count +): """Verify property and relationship rules for Document nodes.""" check_unique_property("Document", "id") check_single_label("Document") - check_relation_count("Document", "Statement", "CITES", max=None, - direction="in") + check_relation_count("Document", "Statement", "CITES", max=None, direction="in") -def test_method_rules(check_unique_property, check_single_label, - check_relation_count): +def test_method_rules(check_unique_property, check_single_label, check_relation_count): """Verify property and relationship rules for Method nodes.""" check_unique_property("Method", "id") check_single_label("Method") - check_relation_count("Method", "Statement", "USES_METHOD", max=None, - direction="in") + check_relation_count("Method", "Statement", "USES_METHOD", max=None, direction="in") def test_no_lost_nodes(graph: Graph): diff --git a/tests/unit/deltas/test_civic_deltas.py b/tests/unit/deltas/test_civic_deltas.py index 8a5b9231..f61238cb 100644 --- a/tests/unit/deltas/test_civic_deltas.py +++ b/tests/unit/deltas/test_civic_deltas.py @@ -5,136 +5,86 @@ import pytest from civicpy.__version__ import __version__ as civicpy_version - -from metakb import PROJECT_ROOT, APP_ROOT +from metakb import APP_ROOT, PROJECT_ROOT from metakb.delta import Delta from metakb.version import __version__ - -MAIN_JSON = PROJECT_ROOT / 'tests' / 'data' / 'deltas' / 'main_civic.json' -UPDATED_JSON = \ - PROJECT_ROOT / 'tests' / 'data' / 'deltas' / 'updated_civic.json' +MAIN_JSON = PROJECT_ROOT / "tests" / "data" / "deltas" / "main_civic.json" +UPDATED_JSON = PROJECT_ROOT / "tests" / "data" / "deltas" / "updated_civic.json" -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def civic(): """Create CIViC Delta test fixture.""" - return Delta(MAIN_JSON, 'civic', _updated_json=UPDATED_JSON) + return Delta(MAIN_JSON, "civic", _updated_json=UPDATED_JSON) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def main_data(): """Create main_data test fixture.""" - with open(MAIN_JSON, 'r') as f: + with open(MAIN_JSON, "r") as f: main_data = json.load(f) return main_data -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def updated_data(): """Create updated_data test fixture.""" - with open(UPDATED_JSON, 'r') as f: + with open(UPDATED_JSON, "r") as f: updated_data = json.load(f) return updated_data -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def diff(): """Create a test fixture for CIViC deltas.""" return { - '_meta': { - 'civicpy_version': civicpy_version, - 'metakb_version': __version__, - 'date_harvested': date.today().strftime('%Y%m%d') + "_meta": { + "civicpy_version": civicpy_version, + "metakb_version": __version__, + "date_harvested": date.today().strftime("%Y%m%d"), + }, + "genes": { + "DELETE": [{"id": 3, "name": "test_remove"}], + "INSERT": [], + "UPDATE": [{"2778": {"aliases": {"$insert": [(1, "MIF2")]}}}], }, - 'genes': { - 'DELETE': [ - { - "id": 3, - "name": "test_remove" - } - ], - 'INSERT': [], - 'UPDATE': [ - { - '2778': { - 'aliases': { - '$insert': [ - (1, 'MIF2') - ] - } - - } - } - ] + "variants": { + "DELETE": [], + "INSERT": [], + "UPDATE": [{"27": {"$delete": ["entrez_name"]}}], }, - 'variants': { - 'DELETE': [], - 'INSERT': [], - 'UPDATE': [ - { - '27': { - '$delete': ['entrez_name'] - } - } - ] + "assertions": { + "DELETE": [], + "INSERT": [{"id": 1, "description": "description"}], + "UPDATE": [], }, - 'assertions': { - 'DELETE': [], - 'INSERT': [ - { - "id": 1, - "description": "description" - } - ], - 'UPDATE': [] + "evidence": { + "INSERT": [], + "DELETE": [], + "UPDATE": [{"358": {"variant_origin": "Somatic"}}], }, - 'evidence': { - 'INSERT': [], - 'DELETE': [], - 'UPDATE': [ - { - "358": {"variant_origin": "Somatic"} - } - ] - } } -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def delta(): """Create empty delta test fixture.""" return { - 'genes': { - 'DELETE': [], - 'INSERT': [], - 'UPDATE': [] - }, - 'variants': { - 'DELETE': [], - 'INSERT': [], - 'UPDATE': [] - }, - 'evidence': { - 'DELETE': [], - 'INSERT': [], - 'UPDATE': [] - }, - 'assertions': { - 'DELETE': [], - 'INSERT': [], - 'UPDATE': [] - } + "genes": {"DELETE": [], "INSERT": [], "UPDATE": []}, + "variants": {"DELETE": [], "INSERT": [], "UPDATE": []}, + "evidence": {"DELETE": [], "INSERT": [], "UPDATE": []}, + "assertions": {"DELETE": [], "INSERT": [], "UPDATE": []}, } def test_init(): """Test that init is correct.""" - cd = Delta(MAIN_JSON, 'civic') + cd = Delta(MAIN_JSON, "civic") assert cd._main_json == MAIN_JSON assert cd._updated_json is None - cd = Delta(MAIN_JSON, 'civic', _updated_json=UPDATED_JSON) + cd = Delta(MAIN_JSON, "civic", _updated_json=UPDATED_JSON) assert cd._main_json == MAIN_JSON assert cd._updated_json == UPDATED_JSON @@ -144,10 +94,15 @@ def test_compute_delta(civic, diff): assert civic.compute_delta() == diff # Test when _updated_json is not in kwargs - cd = Delta(MAIN_JSON, 'civic') + cd = Delta(MAIN_JSON, "civic") cd.compute_delta() - fn = APP_ROOT / 'data' / 'civic' / 'harvester' / \ - f"civic_harvester_{date.today().strftime('%Y%m%d')}.json" + fn = ( + APP_ROOT + / "data" + / "civic" + / "harvester" + / f"civic_harvester_{date.today().strftime('%Y%m%d')}.json" + ) assert fn.exists() os.remove(fn) assert not fn.exists() @@ -155,48 +110,49 @@ def test_compute_delta(civic, diff): def test_ins_del_delta(civic, diff, main_data, updated_data, delta): """Test that _ins_del_delta method is correct.""" - civic._ins_del_delta(delta, 'genes', 'DELETE', [3], main_data['genes']) - assert delta['genes']['DELETE'] == diff['genes']['DELETE'] + civic._ins_del_delta(delta, "genes", "DELETE", [3], main_data["genes"]) + assert delta["genes"]["DELETE"] == diff["genes"]["DELETE"] - civic._ins_del_delta(delta, 'assertions', 'INSERT', [1], - updated_data['assertions']) - assert delta['assertions']['INSERT'] == diff['assertions']['INSERT'] + civic._ins_del_delta(delta, "assertions", "INSERT", [1], updated_data["assertions"]) + assert delta["assertions"]["INSERT"] == diff["assertions"]["INSERT"] def test_update_delta(civic, diff, delta, updated_data, main_data): """Test that _update_delta method is correct.""" - civic._update_delta(delta, 'genes', updated_data['genes'], - main_data['genes']) - assert delta['genes']['UPDATE'] == diff['genes']['UPDATE'] + civic._update_delta(delta, "genes", updated_data["genes"], main_data["genes"]) + assert delta["genes"]["UPDATE"] == diff["genes"]["UPDATE"] - civic._update_delta(delta, 'variants', updated_data['variants'], - main_data['variants']) - assert delta['variants']['UPDATE'] == diff['variants']['UPDATE'] + civic._update_delta( + delta, "variants", updated_data["variants"], main_data["variants"] + ) + assert delta["variants"]["UPDATE"] == diff["variants"]["UPDATE"] - civic._update_delta(delta, 'evidence', updated_data['evidence'], - main_data['evidence']) - assert delta['evidence']['UPDATE'] == diff['evidence']['UPDATE'] + civic._update_delta( + delta, "evidence", updated_data["evidence"], main_data["evidence"] + ) + assert delta["evidence"]["UPDATE"] == diff["evidence"]["UPDATE"] def test_get_ids(civic, main_data, updated_data): """Test that _get_ids method is correct.""" - assert len(civic._get_ids(main_data['assertions'])) == 0 - assert len(civic._get_ids(main_data['variants'])) == 1 - assert len(civic._get_ids(main_data['genes'])) == 2 - assert len(civic._get_ids(main_data['evidence'])) == 1 + assert len(civic._get_ids(main_data["assertions"])) == 0 + assert len(civic._get_ids(main_data["variants"])) == 1 + assert len(civic._get_ids(main_data["genes"])) == 2 + assert len(civic._get_ids(main_data["evidence"])) == 1 - assert len(civic._get_ids(updated_data['assertions'])) == 1 - assert len(civic._get_ids(updated_data['variants'])) == 1 - assert len(civic._get_ids(updated_data['genes'])) == 1 - assert len(civic._get_ids(updated_data['evidence'])) == 1 + assert len(civic._get_ids(updated_data["assertions"])) == 1 + assert len(civic._get_ids(updated_data["variants"])) == 1 + assert len(civic._get_ids(updated_data["genes"])) == 1 + assert len(civic._get_ids(updated_data["evidence"])) == 1 def test_create_json(civic, diff): """Test that _create_json method is correct.""" - test_date = '19980108' + test_date = "19980108" civic._create_json(diff, test_date) - file_name = APP_ROOT / 'data' / 'civic' / 'delta' / f'civic_deltas' \ - f'_{test_date}.json' + file_name = ( + APP_ROOT / "data" / "civic" / "delta" / f"civic_deltas" f"_{test_date}.json" + ) assert file_name.exists() os.remove(file_name) assert not file_name.exists() diff --git a/tests/unit/deltas/test_moa_deltas.py b/tests/unit/deltas/test_moa_deltas.py index 8cc41e53..0f6c15cf 100644 --- a/tests/unit/deltas/test_moa_deltas.py +++ b/tests/unit/deltas/test_moa_deltas.py @@ -1,138 +1,110 @@ """Test MOAlmanac deltas.""" -from datetime import date import json import os +from datetime import date import pytest - -from metakb import PROJECT_ROOT, APP_ROOT +from metakb import APP_ROOT, PROJECT_ROOT from metakb.delta import Delta from metakb.version import __version__ - -MAIN_JSON = PROJECT_ROOT / 'tests' / 'data' / 'deltas' / 'main_moa.json' -UPDATED_JSON = \ - PROJECT_ROOT / 'tests' / 'data' / 'deltas' / 'updated_moa.json' +MAIN_JSON = PROJECT_ROOT / "tests" / "data" / "deltas" / "main_moa.json" +UPDATED_JSON = PROJECT_ROOT / "tests" / "data" / "deltas" / "updated_moa.json" -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def moa(): """Create MOAlmanac Delta test fixture.""" - return Delta(MAIN_JSON, 'moa', _updated_json=UPDATED_JSON) + return Delta(MAIN_JSON, "moa", _updated_json=UPDATED_JSON) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def main_data(): """Create main_data test fixture.""" - with open(MAIN_JSON, 'r') as f: + with open(MAIN_JSON, "r") as f: main_data = json.load(f) return main_data -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def updated_data(): """Create updated_data test fixture.""" - with open(UPDATED_JSON, 'r') as f: + with open(UPDATED_JSON, "r") as f: updated_data = json.load(f) return updated_data -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def diff(): """Create a test fixture for MOAlmanac deltas.""" return { - '_meta': { - 'metakb_version': __version__, - 'date_harvested': date.today().strftime('%Y%m%d'), - 'moa_api_version': '0.2' + "_meta": { + "metakb_version": __version__, + "date_harvested": date.today().strftime("%Y%m%d"), + "moa_api_version": "0.2", }, - 'assertions': { - 'DELETE': [], - 'INSERT': [], - 'UPDATE': [ + "assertions": { + "DELETE": [], + "INSERT": [], + "UPDATE": [ { - '3': { - 'disease': { - 'oncotree_code': 'ALL', - 'oncotree_term': 'Acute Lymphoid Leukemia' + "3": { + "disease": { + "oncotree_code": "ALL", + "oncotree_term": "Acute Lymphoid Leukemia", }, - 'created_on': '01/16/21', - '$delete': ['test update delete'] + "created_on": "01/16/21", + "$delete": ["test update delete"], } } - ] - }, - 'sources': { - 'DELETE': [], - 'INSERT': [ - { - 'id': 22, - 'type': 'Journal', - 'assertion_id': [30, 288], - 'doi': '10.1371/journal.pgen.1004135', - 'nct': None, - 'pmid': '24550739', - 'url': 'https://doi.org/10.1371/journal.pgen.1004135', - 'citation': 'Borad MJ, Champion MD, Egan JB, et al. ' - 'Integrated genomic characterization reveals ' - 'novel, therapeutically relevant drug targets' - ' in FGFR and EGFR pathways in sporadic ' - 'intrahepatic cholangiocarcinoma. PLoS Genet.' - ' 2014;10(2):e1004135.' - } ], - 'UPDATE': [ - { - '2': { - 'assertion_id': { - '$insert': [(1, 3)] - } - } - } - ] }, - 'variants': { - 'DELETE': [ + "sources": { + "DELETE": [], + "INSERT": [ { - 'id': 5, - 'feature_type': 'test_removal' + "id": 22, + "type": "Journal", + "assertion_id": [30, 288], + "doi": "10.1371/journal.pgen.1004135", + "nct": None, + "pmid": "24550739", + "url": "https://doi.org/10.1371/journal.pgen.1004135", + "citation": "Borad MJ, Champion MD, Egan JB, et al. " + "Integrated genomic characterization reveals " + "novel, therapeutically relevant drug targets" + " in FGFR and EGFR pathways in sporadic " + "intrahepatic cholangiocarcinoma. PLoS Genet." + " 2014;10(2):e1004135.", } ], - 'INSERT': [], - 'UPDATE': [] - } + "UPDATE": [{"2": {"assertion_id": {"$insert": [(1, 3)]}}}], + }, + "variants": { + "DELETE": [{"id": 5, "feature_type": "test_removal"}], + "INSERT": [], + "UPDATE": [], + }, } -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def delta(): """Create empty delta test fixture.""" return { - 'variants': { - 'DELETE': [], - 'INSERT': [], - 'UPDATE': [] - }, - 'sources': { - 'DELETE': [], - 'INSERT': [], - 'UPDATE': [] - }, - 'assertions': { - 'DELETE': [], - 'INSERT': [], - 'UPDATE': [] - } + "variants": {"DELETE": [], "INSERT": [], "UPDATE": []}, + "sources": {"DELETE": [], "INSERT": [], "UPDATE": []}, + "assertions": {"DELETE": [], "INSERT": [], "UPDATE": []}, } def test_init(): """Test that init is correct.""" - moad = Delta(MAIN_JSON, 'moa') + moad = Delta(MAIN_JSON, "moa") assert moad._main_json == MAIN_JSON assert moad._updated_json is None - moad = Delta(MAIN_JSON, 'moa', _updated_json=UPDATED_JSON) + moad = Delta(MAIN_JSON, "moa", _updated_json=UPDATED_JSON) assert moad._main_json == MAIN_JSON assert moad._updated_json == UPDATED_JSON @@ -142,10 +114,15 @@ def test_compute_delta(moa, diff): assert moa.compute_delta() == diff # Test when _updated_json is not in kwargs - moad = Delta(MAIN_JSON, 'moa') + moad = Delta(MAIN_JSON, "moa") moad.compute_delta() - fn = APP_ROOT / 'data' / 'moa' / 'harvester' /\ - f"moa_harvester_{date.today().strftime('%Y%m%d')}.json" + fn = ( + APP_ROOT + / "data" + / "moa" + / "harvester" + / f"moa_harvester_{date.today().strftime('%Y%m%d')}.json" + ) assert fn.exists() os.remove(fn) assert not fn.exists() @@ -153,42 +130,40 @@ def test_compute_delta(moa, diff): def test_ins_del_delta(moa, diff, main_data, updated_data, delta): """Test that _ins_del_delta method is correct.""" - moa._ins_del_delta(delta, 'variants', 'DELETE', [5], main_data['variants']) - assert delta['variants']['DELETE'] == diff['variants']['DELETE'] + moa._ins_del_delta(delta, "variants", "DELETE", [5], main_data["variants"]) + assert delta["variants"]["DELETE"] == diff["variants"]["DELETE"] - moa._ins_del_delta(delta, 'sources', 'INSERT', [22], - updated_data['sources']) - assert delta['sources']['INSERT'] == diff['sources']['INSERT'] + moa._ins_del_delta(delta, "sources", "INSERT", [22], updated_data["sources"]) + assert delta["sources"]["INSERT"] == diff["sources"]["INSERT"] def test_update_delta(moa, diff, delta, updated_data, main_data): """Test that _update_delta method is correct.""" - moa._update_delta(delta, 'assertions', updated_data['assertions'], - main_data['assertions']) - assert delta['assertions']['UPDATE'] == diff['assertions']['UPDATE'] + moa._update_delta( + delta, "assertions", updated_data["assertions"], main_data["assertions"] + ) + assert delta["assertions"]["UPDATE"] == diff["assertions"]["UPDATE"] - moa._update_delta(delta, 'sources', updated_data['sources'], - main_data['sources']) - assert delta['sources']['UPDATE'] == diff['sources']['UPDATE'] + moa._update_delta(delta, "sources", updated_data["sources"], main_data["sources"]) + assert delta["sources"]["UPDATE"] == diff["sources"]["UPDATE"] def test_get_ids(moa, main_data, updated_data): """Test that _get_ids method is correct.""" - assert len(moa._get_ids(main_data['assertions'])) == 1 - assert len(moa._get_ids(main_data['variants'])) == 3 - assert len(moa._get_ids(main_data['sources'])) == 1 + assert len(moa._get_ids(main_data["assertions"])) == 1 + assert len(moa._get_ids(main_data["variants"])) == 3 + assert len(moa._get_ids(main_data["sources"])) == 1 - assert len(moa._get_ids(updated_data['assertions'])) == 1 - assert len(moa._get_ids(updated_data['variants'])) == 2 - assert len(moa._get_ids(updated_data['sources'])) == 2 + assert len(moa._get_ids(updated_data["assertions"])) == 1 + assert len(moa._get_ids(updated_data["variants"])) == 2 + assert len(moa._get_ids(updated_data["sources"])) == 2 def test_create_json(moa, diff): """Test that _create_json method is correct.""" - test_date = '19980108' + test_date = "19980108" moa._create_json(diff, test_date) - file_name = APP_ROOT / 'data' / 'moa' / 'delta' / f'moa_deltas_' \ - f'{test_date}.json' + file_name = APP_ROOT / "data" / "moa" / "delta" / f"moa_deltas_" f"{test_date}.json" assert file_name.exists() os.remove(file_name) assert not file_name.exists() diff --git a/tests/unit/harvesters/moa/test_moa_assertions.py b/tests/unit/harvesters/moa/test_moa_assertions.py index 0f3135b1..500df771 100644 --- a/tests/unit/harvesters/moa/test_moa_assertions.py +++ b/tests/unit/harvesters/moa/test_moa_assertions.py @@ -2,10 +2,9 @@ import json import pytest -from mock import patch - -from metakb import PROJECT_ROOT # noqa: I202 +from metakb import PROJECT_ROOT from metakb.harvesters import MOAHarvester +from mock import patch @pytest.fixture(scope="module") @@ -15,12 +14,12 @@ def assertion170(): "id": 170, "context": "", "description": "Administration of bevacizumab in a dabrafenib-resistant cell " - "line counteracted the tumor growth stimulating effect of " - "administering dabrafenib post-resistance.", + "line counteracted the tumor growth stimulating effect of " + "administering dabrafenib post-resistance.", "disease": { "name": "Melanoma", "oncotree_code": "MEL", - "oncotree_term": "Melanoma" + "oncotree_term": "Melanoma", }, "therapy_name": "Bevacizumab", "therapy_type": "Targeted therapy", @@ -46,30 +45,26 @@ def assertion170(): "rsid": "rs113488022", "start_position": "140453136", "variant_annotation": "Missense", - "feature": "BRAF p.V600E (Missense)" - } + "feature": "BRAF p.V600E (Missense)", + }, } @patch.object(MOAHarvester, "_get_all_variants") @patch.object(MOAHarvester, "_get_all_assertions") -def test_assertion_170(test_get_all_assertions, test_get_all_variants, - assertion170): +def test_assertion_170(test_get_all_assertions, test_get_all_variants, assertion170): """Test moa harvester works correctly for assertions.""" - with open(f"{PROJECT_ROOT}/tests/data/" - f"harvesters/moa/assertions.json") as f: + with open(f"{PROJECT_ROOT}/tests/data/" f"harvesters/moa/assertions.json") as f: data = json.load(f) test_get_all_assertions.return_value = data - with open(f"{PROJECT_ROOT}/tests/data/" - f"harvesters/moa/variants.json") as f: + with open(f"{PROJECT_ROOT}/tests/data/" f"harvesters/moa/variants.json") as f: data = json.load(f) test_get_all_variants.return_value = data assertion_resp = MOAHarvester()._get_all_assertions() _, variants_list = MOAHarvester().harvest_variants() - assertions = MOAHarvester().harvest_assertions( - assertion_resp, variants_list) + assertions = MOAHarvester().harvest_assertions(assertion_resp, variants_list) actual = None for a in assertions: diff --git a/tests/unit/harvesters/moa/test_moa_harvest.py b/tests/unit/harvesters/moa/test_moa_harvest.py index c7044254..09c09390 100644 --- a/tests/unit/harvesters/moa/test_moa_harvest.py +++ b/tests/unit/harvesters/moa/test_moa_harvest.py @@ -1,14 +1,15 @@ """Test MOAlmanac Harvester.""" -from metakb.harvesters import MOAHarvester -from metakb import APP_ROOT import os +from metakb import APP_ROOT +from metakb.harvesters import MOAHarvester + def test_harvest(): """Test MOAlmanac harvest method.""" - fn = 'test_moa_harvester.json' + fn = "test_moa_harvester.json" assert MOAHarvester().harvest(filename=fn) - file_path = APP_ROOT / 'data' / 'moa' / 'harvester' / fn + file_path = APP_ROOT / "data" / "moa" / "harvester" / fn assert file_path.exists() os.remove(file_path) assert not file_path.exists() diff --git a/tests/unit/harvesters/moa/test_moa_source.py b/tests/unit/harvesters/moa/test_moa_source.py index 20b665da..aa9848be 100644 --- a/tests/unit/harvesters/moa/test_moa_source.py +++ b/tests/unit/harvesters/moa/test_moa_source.py @@ -1,11 +1,10 @@ """Test MOAlmanac source""" import json -from mock import patch import pytest - -from metakb import PROJECT_ROOT # noqa: I202 +from metakb import PROJECT_ROOT from metakb.harvesters import MOAHarvester +from mock import patch @pytest.fixture(scope="module") @@ -27,17 +26,16 @@ def source68(): "pmid": 27532019, "url": "https://doi.org/10.1186/s40425-016-0148-7", "citation": "Amin A, Lawson DH, Salama AK, et al. Phase II " - "study of vemurafenib followed by ipilimumab in patients " - "with previously untreated BRAF-mutated metastatic " - "melanoma. J Immunother Cancer. 2016;4:44." + "study of vemurafenib followed by ipilimumab in patients " + "with previously untreated BRAF-mutated metastatic " + "melanoma. J Immunother Cancer. 2016;4:44.", } @patch.object(MOAHarvester, "_get_all_assertions") def test_source68(test_get_all_assertions, source68): """Test moa harvester works correctly for evidence.""" - with open(f"{PROJECT_ROOT}/tests/data/" - f"harvesters/moa/assertions.json") as f: + with open(f"{PROJECT_ROOT}/tests/data/" f"harvesters/moa/assertions.json") as f: data = json.load(f) test_get_all_assertions.return_value = data diff --git a/tests/unit/harvesters/oncokb/test_oncokb_harvest.py b/tests/unit/harvesters/oncokb/test_oncokb_harvest.py index 8a8e0cd3..7d974cca 100644 --- a/tests/unit/harvesters/oncokb/test_oncokb_harvest.py +++ b/tests/unit/harvesters/oncokb/test_oncokb_harvest.py @@ -1,21 +1,34 @@ """Test OncoKB Harvester""" import os -from metakb.harvesters import OncoKBHarvester from metakb import APP_ROOT, PROJECT_ROOT +from metakb.harvesters import OncoKBHarvester def test_harvest(): """Test OncoKB harvest method""" - ONCOKB_API_TOKEN = os.environ.get("ONCOKB_API_TOKEN") - o = OncoKBHarvester(api_token=ONCOKB_API_TOKEN) + api_token = os.environ.get("ONCOKB_API_TOKEN") + o = OncoKBHarvester(api_token=api_token) assert not o.harvest("") fn = "test_oncokb_harvester.json" - variants_by_protein_change_path = PROJECT_ROOT / "tests" / "data" / "harvesters" / \ - "oncokb" / "variants_by_protein_change.csv" + variants_by_protein_change_path = ( + PROJECT_ROOT + / "tests" + / "data" + / "harvesters" + / "oncokb" + / "variants_by_protein_change.csv" + ) assert o.harvest(variants_by_protein_change_path, fn) - for var in [o.genes, o.variants, o.metadata, o.diagnostic_levels, - o.prognostic_levels, o.sensitive_levels, o.sensitive_levels]: + for var in [ + o.genes, + o.variants, + o.metadata, + o.diagnostic_levels, + o.prognostic_levels, + o.sensitive_levels, + o.sensitive_levels, + ]: assert var file_path = APP_ROOT / "data" / "oncokb" / "harvester" / fn assert file_path.exists() diff --git a/tests/unit/harvesters/test_base_class.py b/tests/unit/harvesters/test_base_class.py index 41bfd308..bfd98451 100644 --- a/tests/unit/harvesters/test_base_class.py +++ b/tests/unit/harvesters/test_base_class.py @@ -1,9 +1,9 @@ -"""This module tests the Harvester base class.""" -from metakb.harvesters import base +"""Tests the Harvester base class.""" import pytest +from metakb.harvesters import base -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def bh(): """Create a base Harvester fixture for testing.""" bh = base.Harvester() diff --git a/tests/unit/harvesters/test_civic_harvester.py b/tests/unit/harvesters/test_civic_harvester.py index 838222f0..8b15bb7d 100644 --- a/tests/unit/harvesters/test_civic_harvester.py +++ b/tests/unit/harvesters/test_civic_harvester.py @@ -1,13 +1,11 @@ """Test CIViC Harvester class""" -import os import json +import os import pytest - -from metakb import PROJECT_ROOT, APP_ROOT +from metakb import APP_ROOT, PROJECT_ROOT from metakb.harvesters import CIViCHarvester - TEST_DATA_PATH = PROJECT_ROOT / "tests" / "data" / "harvesters" / "civic" TEST_CIVICPY_CACHE_PATH = list(sorted(TEST_DATA_PATH.glob("civicpy_cache_*.pkl")))[-1] @@ -85,7 +83,7 @@ def civic_aid_7(): def test_harvest(harvester): """Test that CIViC harvest method works correctly""" - fn = 'test_civic_harvester.json' + fn = "test_civic_harvester.json" assert harvester.harvest(filename=fn) file_path = APP_ROOT / "data" / "civic" / "harvester" / fn assert file_path.exists() @@ -139,7 +137,9 @@ def test_civic_evidence(harvested_evidence, civic_eid_3017): elif e["id"] == 6178: assert e["assertion_ids"] == [12, 7] checked.append(e["id"]) - assert len(checked) == 2, f"Expected to check CIViC Evidence Items 3017 and 6178, but only checked {checked}" # noqa: E501 + assert ( + len(checked) == 2 + ), f"Expected to check CIViC Evidence Items 3017 and 6178, but only checked {checked}" assert checked, "CIViC Evidence Item 3017 not in harvested evidence" diff --git a/tests/unit/setup/test_minimal_setup.py b/tests/unit/setup/test_minimal_setup.py index a665cf45..6b3ce292 100644 --- a/tests/unit/setup/test_minimal_setup.py +++ b/tests/unit/setup/test_minimal_setup.py @@ -1,4 +1,4 @@ -"""This module tests basic project setup.""" +"""Tests basic project setup.""" import sys diff --git a/tests/unit/test_query.py b/tests/unit/test_query.py index c6346959..2c3215df 100644 --- a/tests/unit/test_query.py +++ b/tests/unit/test_query.py @@ -1,7 +1,6 @@ """Test the MetaKB search method.""" -from metakb.schemas import SourceName import pytest - +from metakb.schemas import SourceName from metakb.version import __version__ # TODO: @@ -13,21 +12,21 @@ async def return_response(query_handler, statement_id, **kwargs): """Return the statement given ID if it exists.""" response = await query_handler.search(**kwargs) - statements = response['statements'] - propositions = response['propositions'] + statements = response["statements"] + propositions = response["propositions"] assert len(statements) != 0 assert len(propositions) != 0 - assert len(response['matches']['statements']) != 0 - assert len(response['matches']['propositions']) != 0 + assert len(response["matches"]["statements"]) != 0 + assert len(response["matches"]["propositions"]) != 0 s = None for statement in statements: - if statement['id'] == statement_id: + if statement["id"] == statement_id: s = statement break p = None for proposition in propositions: - if s['proposition'] == proposition['id']: + if s["proposition"] == proposition["id"]: p = proposition break return s, p @@ -35,40 +34,49 @@ async def return_response(query_handler, statement_id, **kwargs): def assert_no_match(response): """No match assertions for queried concepts in search.""" - assert response['statements'] == [] - assert response['propositions'] == [] - assert len(response['warnings']) > 0 + assert response["statements"] == [] + assert response["propositions"] == [] + assert len(response["warnings"]) > 0 def assert_no_match_id(response): """No match assertions for search by id.""" assert len(response.keys()) == 3 - assert len(response['warnings']) > 0 + assert len(response["warnings"]) > 0 def assert_keys_for_detail_false(response_keys): """Check that keys aren't in response when detail is false.""" - assert 'variation_descriptors' not in response_keys - assert 'gene_descriptors' not in response_keys - assert 'therapy_descriptors' not in response_keys - assert 'disease_descriptors' not in response_keys - assert 'methods' not in response_keys - assert 'documents' not in response_keys + assert "variation_descriptors" not in response_keys + assert "gene_descriptors" not in response_keys + assert "therapy_descriptors" not in response_keys + assert "disease_descriptors" not in response_keys + assert "methods" not in response_keys + assert "documents" not in response_keys -def assert_keys_for_detail_true(response_keys, response, is_evidence=True, - tr_response=True): +def assert_keys_for_detail_true( + response_keys, response, is_evidence=True, tr_response=True +): """Check that keys are in response when detail is false.""" - fields = ['variation_descriptors', 'gene_descriptors', - 'disease_descriptors', 'methods', - 'documents', 'statements', 'propositions'] + fields = [ + "variation_descriptors", + "gene_descriptors", + "disease_descriptors", + "methods", + "documents", + "statements", + "propositions", + ] if tr_response: - fields += ['therapy_descriptors'] + fields += ["therapy_descriptors"] for field in fields: assert field in response_keys if is_evidence: # Evidence only does not have supported_by with other statements - if field == "documents" and response["statements"][0]["id"].startswith(SourceName.ONCOKB): # noqa: E501 + if field == "documents" and response["statements"][0]["id"].startswith( + SourceName.ONCOKB + ): # OncoKB can have multiple documents, which differs from CIViC + MOA assert len(response[field]) > 0 else: @@ -77,36 +85,43 @@ def assert_keys_for_detail_true(response_keys, response, is_evidence=True, assert len(response[field]) > 1, field -def assert_response_items(response, statement, proposition, - variation_descriptor, gene_descriptor, - disease_descriptor, method, - document, therapy_descriptor, - check_statement, check_proposition, - check_variation_descriptor, - check_descriptor, check_method, check_document - ): +def assert_response_items( + response, + statement, + proposition, + variation_descriptor, + gene_descriptor, + disease_descriptor, + method, + document, + therapy_descriptor, + check_statement, + check_proposition, + check_variation_descriptor, + check_descriptor, + check_method, + check_document, +): """Check that search response match expected values.""" if therapy_descriptor: assert_keys_for_detail_true(response.keys(), response) else: - assert_keys_for_detail_true(response.keys(), response, - tr_response=False) + assert_keys_for_detail_true(response.keys(), response, tr_response=False) - response_statement = response['statements'][0] - response_proposition = response['propositions'][0] - response_variation_descriptor = response['variation_descriptors'][0] - response_gene_descriptor = response['gene_descriptors'][0] + response_statement = response["statements"][0] + response_proposition = response["propositions"][0] + response_variation_descriptor = response["variation_descriptors"][0] + response_gene_descriptor = response["gene_descriptors"][0] if therapy_descriptor: - response_therapy_descriptor = response['therapy_descriptors'][0] + response_therapy_descriptor = response["therapy_descriptors"][0] else: response_therapy_descriptor = None - response_disease_descriptor = response['disease_descriptors'][0] - response_method = response['methods'][0] + response_disease_descriptor = response["disease_descriptors"][0] + response_method = response["methods"][0] check_statement(response_statement, statement) check_proposition(response_proposition, proposition) - check_variation_descriptor(response_variation_descriptor, - variation_descriptor) + check_variation_descriptor(response_variation_descriptor, variation_descriptor) check_descriptor(gene_descriptor, response_gene_descriptor) check_descriptor(disease_descriptor, response_disease_descriptor) if therapy_descriptor: @@ -124,58 +139,57 @@ def assert_response_items(response, statement, proposition, check_document(actual_d, test_d) assert doc_ids == [], "Document IDs were all not checked" else: - response_document = response['documents'][0] + response_document = response["documents"][0] check_document(response_document, document) # Assert that IDs match in response items - assert response_statement['proposition'] == response_proposition['id'] - assert response_statement['variation_descriptor'] == \ - response_variation_descriptor['id'] + assert response_statement["proposition"] == response_proposition["id"] + assert ( + response_statement["variation_descriptor"] + == response_variation_descriptor["id"] + ) if therapy_descriptor: - assert response_statement['therapy_descriptor'] == \ - response_therapy_descriptor['id'] - assert response_statement['disease_descriptor'] == \ - response_disease_descriptor['id'] - assert response_statement['method'] == response_method['id'] + assert ( + response_statement["therapy_descriptor"] + == response_therapy_descriptor["id"] + ) + assert response_statement["disease_descriptor"] == response_disease_descriptor["id"] + assert response_statement["method"] == response_method["id"] if response_statement["id"].startswith(SourceName.ONCOKB): - supported_by = response_statement['supported_by'] + supported_by = response_statement["supported_by"] documents_ids = [d["id"] for d in response["documents"]] assert set(supported_by) == set(documents_ids) else: - assert response_statement['supported_by'][0] == response_document['id'] + assert response_statement["supported_by"][0] == response_document["id"] - assert proposition['subject'] == \ - response_variation_descriptor['variation_id'] - assert proposition['object_qualifier'] == \ - response_disease_descriptor['disease_id'] + assert proposition["subject"] == response_variation_descriptor["variation_id"] + assert proposition["object_qualifier"] == response_disease_descriptor["disease_id"] if therapy_descriptor: - assert proposition['object'] == \ - response_therapy_descriptor['therapy_id'] + assert proposition["object"] == response_therapy_descriptor["therapy_id"] - assert response_variation_descriptor['gene_context'] == \ - response_gene_descriptor['id'] + assert ( + response_variation_descriptor["gene_context"] == response_gene_descriptor["id"] + ) def assert_general_search_queries(response): """Check for general search queries.""" - assert response['matches'] - len_statement_matches = len(response['matches']['statements']) + assert response["matches"] + len_statement_matches = len(response["matches"]["statements"]) assert len_statement_matches > 0 - assert len(response['matches']['propositions']) > 0 - len_statements = len(response['statements']) + assert len(response["matches"]["propositions"]) > 0 + len_statements = len(response["statements"]) assert len_statements > 0 assert len_statement_matches == len_statements - assert len(response['propositions']) > 0 - assert len(response['methods']) > 0 - assert len(response['documents']) > 0 + assert len(response["propositions"]) > 0 + assert len(response["methods"]) > 0 + assert len(response["documents"]) > 0 def test_search_id(query_handler): """Test that search id method works correctly.""" - resp = query_handler.search_by_id( - "proposition:xsTCVDo1bo2P_6Sext0Y3ibU3MPbiyXE" - ) + resp = query_handler.search_by_id("proposition:xsTCVDo1bo2P_6Sext0Y3ibU3MPbiyXE") assert resp["proposition"] assert not resp["warnings"] assert query_handler.search_by_id("proposition:001")["warnings"] @@ -186,118 +200,132 @@ def test_search_id(query_handler): @pytest.mark.asyncio async def test_general_search_queries(query_handler): """Test that queries do not return errors.""" - response = await query_handler.search(variation='braf v600e', detail=True) + response = await query_handler.search(variation="braf v600e", detail=True) assert_general_search_queries(response) - response = await query_handler.search(variation='egfr l858r', detail=True) + response = await query_handler.search(variation="egfr l858r", detail=True) assert_general_search_queries(response) - response = await query_handler.search(disease='cancer', detail=True) + response = await query_handler.search(disease="cancer", detail=True) assert_general_search_queries(response) @pytest.mark.asyncio -async def test_civic_eid2997(query_handler, civic_eid2997_statement, - civic_eid2997_proposition, check_statement, - check_proposition): +async def test_civic_eid2997( + query_handler, + civic_eid2997_statement, + civic_eid2997_proposition, + check_statement, + check_proposition, +): """Test search on CIViC Evidence Item 2997.""" - statement_id = 'civic.eid:2997' + statement_id = "civic.eid:2997" # Test search by Subject - s, p = await return_response(query_handler, statement_id, - variation='ga4gh:VA.kgjrhgf84CEndyLjKdAO0RxN-e3pJjxA') + s, p = await return_response( + query_handler, + statement_id, + variation="ga4gh:VA.kgjrhgf84CEndyLjKdAO0RxN-e3pJjxA", + ) check_statement(s, civic_eid2997_statement) check_proposition(p, civic_eid2997_proposition) # Test search by Object - s, p = await return_response(query_handler, statement_id, therapy='rxcui:1430438') + s, p = await return_response(query_handler, statement_id, therapy="rxcui:1430438") check_statement(s, civic_eid2997_statement) check_proposition(p, civic_eid2997_proposition) # Test search by Object Qualifier - s, p = await return_response(query_handler, statement_id, disease='ncit:C2926') + s, p = await return_response(query_handler, statement_id, disease="ncit:C2926") check_statement(s, civic_eid2997_statement) check_proposition(p, civic_eid2997_proposition) # Test search by Gene Descriptor # HGNC ID - s, p = await return_response(query_handler, statement_id, gene='hgnc:3236') + s, p = await return_response(query_handler, statement_id, gene="hgnc:3236") check_statement(s, civic_eid2997_statement) check_proposition(p, civic_eid2997_proposition) # Label - s, p = await return_response(query_handler, statement_id, gene='EGFR') + s, p = await return_response(query_handler, statement_id, gene="EGFR") check_statement(s, civic_eid2997_statement) check_proposition(p, civic_eid2997_proposition) # Alt label - s, p = await return_response(query_handler, statement_id, gene='ERBB1') + s, p = await return_response(query_handler, statement_id, gene="ERBB1") check_statement(s, civic_eid2997_statement) check_proposition(p, civic_eid2997_proposition) # Test search by Variation Descriptor # Gene Symbol + Variant Name - s, p = await return_response(query_handler, statement_id, variation='EGFR L858R') + s, p = await return_response(query_handler, statement_id, variation="EGFR L858R") check_statement(s, civic_eid2997_statement) check_proposition(p, civic_eid2997_proposition) # Alt Label - s, p = await return_response(query_handler, statement_id, - variation='egfr Leu858ARG') + s, p = await return_response( + query_handler, statement_id, variation="egfr Leu858ARG" + ) check_statement(s, civic_eid2997_statement) check_proposition(p, civic_eid2997_proposition) # HGVS Expression - s, p = await return_response(query_handler, statement_id, - variation='NP_005219.2:p.Leu858Arg') + s, p = await return_response( + query_handler, statement_id, variation="NP_005219.2:p.Leu858Arg" + ) check_statement(s, civic_eid2997_statement) check_proposition(p, civic_eid2997_proposition) # Test search by Therapy Descriptor # Label - s, p = await return_response(query_handler, statement_id, therapy='Afatinib') + s, p = await return_response(query_handler, statement_id, therapy="Afatinib") check_statement(s, civic_eid2997_statement) check_proposition(p, civic_eid2997_proposition) # Alt Label - s, p = await return_response(query_handler, statement_id, therapy='BIBW2992') + s, p = await return_response(query_handler, statement_id, therapy="BIBW2992") check_statement(s, civic_eid2997_statement) check_proposition(p, civic_eid2997_proposition) # Test search by Disease Descriptor # Label - s, p = await return_response(query_handler, statement_id, - disease='Lung Non-small Cell Carcinoma') + s, p = await return_response( + query_handler, statement_id, disease="Lung Non-small Cell Carcinoma" + ) check_statement(s, civic_eid2997_statement) check_proposition(p, civic_eid2997_proposition) @pytest.mark.asyncio -async def test_civic_eid1409_statement(query_handler, civic_eid1409_statement, - check_statement): +async def test_civic_eid1409_statement( + query_handler, civic_eid1409_statement, check_statement +): """Test search on CIViC Evidence Item 1409.""" - statement_id = 'civic.eid:1409' + statement_id = "civic.eid:1409" # Test search by Subject - s, p = await return_response(query_handler, statement_id, - variation='ga4gh:VA.ZDdoQdURgO2Daj2NxLj4pcDnjiiAsfbO') + s, p = await return_response( + query_handler, + statement_id, + variation="ga4gh:VA.ZDdoQdURgO2Daj2NxLj4pcDnjiiAsfbO", + ) check_statement(s, civic_eid1409_statement) # Test search by Object - s, p = await return_response(query_handler, statement_id, therapy='ncit:C64768') + s, p = await return_response(query_handler, statement_id, therapy="ncit:C64768") check_statement(s, civic_eid1409_statement) # Test search by Object Qualifier - s, p = await return_response(query_handler, statement_id, disease='ncit:C3510') + s, p = await return_response(query_handler, statement_id, disease="ncit:C3510") check_statement(s, civic_eid1409_statement) # Test search by Gene Descriptor # HGNC ID - s, p = await return_response(query_handler, statement_id, gene='hgnc:1097') + s, p = await return_response(query_handler, statement_id, gene="hgnc:1097") check_statement(s, civic_eid1409_statement) # Label - s, p = await return_response(query_handler, statement_id, gene='BRAF') + s, p = await return_response(query_handler, statement_id, gene="BRAF") check_statement(s, civic_eid1409_statement) # TODO: Not found in gene normalizer @@ -308,91 +336,100 @@ async def test_civic_eid1409_statement(query_handler, civic_eid1409_statement, # Test search by Variation Descriptor # Gene Symbol + Variant Name - s, p = await return_response(query_handler, statement_id, variation='BRAF V600E') + s, p = await return_response(query_handler, statement_id, variation="BRAF V600E") check_statement(s, civic_eid1409_statement) # # Alt Label - s, p = await return_response(query_handler, statement_id, - variation='braf val600glu') + s, p = await return_response( + query_handler, statement_id, variation="braf val600glu" + ) check_statement(s, civic_eid1409_statement) # HGVS Expression - s, p = await return_response(query_handler, statement_id, - variation='NP_004324.2:p.Val600Glu') + s, p = await return_response( + query_handler, statement_id, variation="NP_004324.2:p.Val600Glu" + ) check_statement(s, civic_eid1409_statement) # Test search by Therapy Descriptor # Label - s, p = await return_response(query_handler, statement_id, therapy='Vemurafenib') + s, p = await return_response(query_handler, statement_id, therapy="Vemurafenib") check_statement(s, civic_eid1409_statement) # # Alt Label - s, p = await return_response(query_handler, statement_id, - therapy='BRAF(V600E) Kinase Inhibitor RO5185426') + s, p = await return_response( + query_handler, statement_id, therapy="BRAF(V600E) Kinase Inhibitor RO5185426" + ) check_statement(s, civic_eid1409_statement) # Label - s, p = await return_response(query_handler, statement_id, disease='Skin Melanoma') + s, p = await return_response(query_handler, statement_id, disease="Skin Melanoma") check_statement(s, civic_eid1409_statement) @pytest.mark.asyncio async def test_civic_aid6(query_handler, civic_aid6_statement, check_statement): """Test search on CIViC Evidence Item 6.""" - statement_id = 'civic.aid:6' + statement_id = "civic.aid:6" # Test search by Subject - s, p = await return_response(query_handler, statement_id, - variation='ga4gh:VA.kgjrhgf84CEndyLjKdAO0RxN-e3pJjxA') + s, p = await return_response( + query_handler, + statement_id, + variation="ga4gh:VA.kgjrhgf84CEndyLjKdAO0RxN-e3pJjxA", + ) check_statement(s, civic_aid6_statement) # Test search by Object - s, p = await return_response(query_handler, statement_id, therapy='rxcui:1430438') + s, p = await return_response(query_handler, statement_id, therapy="rxcui:1430438") check_statement(s, civic_aid6_statement) # Test search by Object Qualifier - s, p = await return_response(query_handler, statement_id, disease='ncit:C2926') + s, p = await return_response(query_handler, statement_id, disease="ncit:C2926") check_statement(s, civic_aid6_statement) # Test search by Gene Descriptor # HGNC ID - s, p = await return_response(query_handler, statement_id, gene='hgnc:3236') + s, p = await return_response(query_handler, statement_id, gene="hgnc:3236") check_statement(s, civic_aid6_statement) # Label - s, p = await return_response(query_handler, statement_id, gene='EGFR') + s, p = await return_response(query_handler, statement_id, gene="EGFR") check_statement(s, civic_aid6_statement) # Alt label - s, p = await return_response(query_handler, statement_id, gene='ERBB1') + s, p = await return_response(query_handler, statement_id, gene="ERBB1") check_statement(s, civic_aid6_statement) # Test search by Variation Descriptor # Gene Symbol + Variant Name - s, p = await return_response(query_handler, statement_id, variation='EGFR L858R') + s, p = await return_response(query_handler, statement_id, variation="EGFR L858R") check_statement(s, civic_aid6_statement) # Alt Label - s, p = await return_response(query_handler, statement_id, - variation='egfr leu858arg') + s, p = await return_response( + query_handler, statement_id, variation="egfr leu858arg" + ) check_statement(s, civic_aid6_statement) # HGVS Expression - s, p = await return_response(query_handler, statement_id, - variation='NP_005219.2:p.leu858arg') + s, p = await return_response( + query_handler, statement_id, variation="NP_005219.2:p.leu858arg" + ) check_statement(s, civic_aid6_statement) # Label - s, p = await return_response(query_handler, statement_id, therapy='afatinib') + s, p = await return_response(query_handler, statement_id, therapy="afatinib") check_statement(s, civic_aid6_statement) # Alt Label - s, p = await return_response(query_handler, statement_id, therapy='BIBW 2992') + s, p = await return_response(query_handler, statement_id, therapy="BIBW 2992") check_statement(s, civic_aid6_statement) # Label - s, p = await return_response(query_handler, statement_id, - disease='Lung Non-small Cell Carcinoma ') + s, p = await return_response( + query_handler, statement_id, disease="Lung Non-small Cell Carcinoma " + ) check_statement(s, civic_aid6_statement) @@ -400,224 +437,340 @@ async def test_civic_aid6(query_handler, civic_aid6_statement, check_statement): async def test_multiple_parameters(query_handler): """Test that multiple parameter searches work correctly.""" # Test no match - response = await query_handler.search(variation=' braf v600e', gene='egfr', - disease='cancer', therapy='cisplatin') + response = await query_handler.search( + variation=" braf v600e", gene="egfr", disease="cancer", therapy="cisplatin" + ) assert_no_match(response) - response = await query_handler.search(therapy='cisplatin', disease='4dfadfafas') + response = await query_handler.search(therapy="cisplatin", disease="4dfadfafas") assert_no_match(response) # Test EID2997 queries - object_qualifier = 'ncit:C2926' - subject = 'ga4gh:VA.kgjrhgf84CEndyLjKdAO0RxN-e3pJjxA' - object = 'rxcui:1430438' + object_qualifier = "ncit:C2926" + subject = "ga4gh:VA.kgjrhgf84CEndyLjKdAO0RxN-e3pJjxA" + object = "rxcui:1430438" response = await query_handler.search( - variation='NP_005219.2:p.Leu858Arg', - disease='NSCLC', - therapy='Afatinib' + variation="NP_005219.2:p.Leu858Arg", disease="NSCLC", therapy="Afatinib" ) - for p in response['propositions']: - if p['id'] in response['matches']['propositions']: - assert p['object_qualifier'] == object_qualifier - assert p['subject'] == subject - assert p['object'] == object + for p in response["propositions"]: + if p["id"] in response["matches"]["propositions"]: + assert p["object_qualifier"] == object_qualifier + assert p["subject"] == subject + assert p["object"] == object # Wrong gene response = await query_handler.search( - variation='NP_005219.2:p.Leu858Arg', - disease='NSCLC', - therapy='Afatinib', - gene='braf' + variation="NP_005219.2:p.Leu858Arg", + disease="NSCLC", + therapy="Afatinib", + gene="braf", ) assert_no_match(response) # Test eid1409 queries - object_qualifier = 'ncit:C3510' - subject = 'ga4gh:VA.ZDdoQdURgO2Daj2NxLj4pcDnjiiAsfbO' + object_qualifier = "ncit:C3510" + subject = "ga4gh:VA.ZDdoQdURgO2Daj2NxLj4pcDnjiiAsfbO" response = await query_handler.search( - variation=subject, - disease='malignant trunk melanoma' + variation=subject, disease="malignant trunk melanoma" ) - for p in response['propositions']: - if p['id'] in response['matches']['propositions']: - assert p['object_qualifier'] == object_qualifier - assert p['subject'] == subject - assert p['object'] + for p in response["propositions"]: + if p["id"] in response["matches"]["propositions"]: + assert p["object_qualifier"] == object_qualifier + assert p["subject"] == subject + assert p["object"] # No Match for statement ID response = await query_handler.search( variation=subject, - disease='malignant trunk melanoma', - statement_id='civic.eid:2997' + disease="malignant trunk melanoma", + statement_id="civic.eid:2997", ) assert_no_match(response) # CIViC EID2997 response = await query_handler.search( - statement_id='civiC.eid:2997', - variation='ga4gh:VA.kgjrhgf84CEndyLjKdAO0RxN-e3pJjxA' + statement_id="civiC.eid:2997", + variation="ga4gh:VA.kgjrhgf84CEndyLjKdAO0RxN-e3pJjxA", ) - assert len(response['statements']) == 1 - assert len(response['propositions']) == 1 - assert len(response['matches']['statements']) == 1 - assert len(response['matches']['propositions']) == 1 + assert len(response["statements"]) == 1 + assert len(response["propositions"]) == 1 + assert len(response["matches"]["statements"]) == 1 + assert len(response["matches"]["propositions"]) == 1 # CIViC AID6 response = await query_handler.search( - statement_id='CIViC.AID:6', - variation='ga4gh:VA.kgjrhgf84CEndyLjKdAO0RxN-e3pJjxA', - disease='ncit:C2926' + statement_id="CIViC.AID:6", + variation="ga4gh:VA.kgjrhgf84CEndyLjKdAO0RxN-e3pJjxA", + disease="ncit:C2926", ) - assert len(response['statements']) > 1 - assert len(response['propositions']) > 1 - assert len(response['matches']['statements']) == 1 - assert len(response['matches']['propositions']) == 1 + assert len(response["statements"]) > 1 + assert len(response["propositions"]) > 1 + assert len(response["matches"]["statements"]) == 1 + assert len(response["matches"]["propositions"]) == 1 civic_aid6_supported_by_statements = list() - for s in response['statements']: - if s['id'] == 'civic.aid:6': + for s in response["statements"]: + if s["id"] == "civic.aid:6": statement = s else: - civic_aid6_supported_by_statements.append(s['id']) - supported_by_statements = [s for s in statement['supported_by'] if - s.startswith('civic.eid:')] - assert set(civic_aid6_supported_by_statements) == \ - set(supported_by_statements) + civic_aid6_supported_by_statements.append(s["id"]) + supported_by_statements = [ + s for s in statement["supported_by"] if s.startswith("civic.eid:") + ] + assert set(civic_aid6_supported_by_statements) == set(supported_by_statements) response = await query_handler.search( - disease='ncit:C2926', - variation='ga4gh:VA.kgjrhgf84CEndyLjKdAO0RxN-e3pJjxA' + disease="ncit:C2926", variation="ga4gh:VA.kgjrhgf84CEndyLjKdAO0RxN-e3pJjxA" ) statement_ids = list() - for s in response['statements']: - if s['id'] == 'civic.aid:6': + for s in response["statements"]: + if s["id"] == "civic.aid:6": pass else: - statement_ids.append(s['id']) + statement_ids.append(s["id"]) for aid6_statement in civic_aid6_supported_by_statements: assert aid6_statement in statement_ids - assert len(response['matches']['statements']) > 1 - assert len(response['matches']['propositions']) > 1 + assert len(response["matches"]["statements"]) > 1 + assert len(response["matches"]["propositions"]) > 1 @pytest.mark.asyncio async def test_civic_detail_flag_therapeutic( - query_handler, civic_eid2997_statement, civic_eid2997_proposition, civic_vid33, - civic_gid19, civic_did8, method1, pmid_23982599, civic_tid146, check_statement, - check_proposition, check_variation_descriptor, check_descriptor, check_method, - check_document + query_handler, + civic_eid2997_statement, + civic_eid2997_proposition, + civic_vid33, + civic_gid19, + civic_did8, + method1, + pmid_23982599, + civic_tid146, + check_statement, + check_proposition, + check_variation_descriptor, + check_descriptor, + check_method, + check_document, ): """Test that detail flag works correctly for CIViC Therapeutic Response.""" - response = await query_handler.search(statement_id='civic.eid:2997', detail=False) + response = await query_handler.search(statement_id="civic.eid:2997", detail=False) assert_keys_for_detail_false(response.keys()) - response = await query_handler.search(statement_id='civic.eid:2997', detail=True) + response = await query_handler.search(statement_id="civic.eid:2997", detail=True) assert_keys_for_detail_true(response.keys(), response) - assert_response_items(response, civic_eid2997_statement, - civic_eid2997_proposition, - civic_vid33, civic_gid19, civic_did8, - method1, pmid_23982599, civic_tid146, - check_statement, check_proposition, - check_variation_descriptor, - check_descriptor, check_method, check_document - ) + assert_response_items( + response, + civic_eid2997_statement, + civic_eid2997_proposition, + civic_vid33, + civic_gid19, + civic_did8, + method1, + pmid_23982599, + civic_tid146, + check_statement, + check_proposition, + check_variation_descriptor, + check_descriptor, + check_method, + check_document, + ) @pytest.mark.asyncio async def test_civic_detail_flag_diagnostic( - query_handler, civic_eid2_statement, civic_eid2_proposition, civic_vid99, - civic_did2, civic_gid38, method1, pmid_15146165, check_statement, check_proposition, - check_variation_descriptor, check_descriptor, check_method, check_document + query_handler, + civic_eid2_statement, + civic_eid2_proposition, + civic_vid99, + civic_did2, + civic_gid38, + method1, + pmid_15146165, + check_statement, + check_proposition, + check_variation_descriptor, + check_descriptor, + check_method, + check_document, ): """Test that detail flag works correctly for CIViC Diagnostic Response.""" - response = await query_handler.search(statement_id='civic.eid:2', detail=False) + response = await query_handler.search(statement_id="civic.eid:2", detail=False) assert_keys_for_detail_false(response.keys()) - response = await query_handler.search(statement_id='civic.eid:2', detail=True) + response = await query_handler.search(statement_id="civic.eid:2", detail=True) assert_keys_for_detail_true(response.keys(), response, tr_response=False) - assert_response_items(response, civic_eid2_statement, - civic_eid2_proposition, - civic_vid99, civic_gid38, civic_did2, - method1, pmid_15146165, None, check_statement, - check_proposition, check_variation_descriptor, - check_descriptor, check_method, check_document) + assert_response_items( + response, + civic_eid2_statement, + civic_eid2_proposition, + civic_vid99, + civic_gid38, + civic_did2, + method1, + pmid_15146165, + None, + check_statement, + check_proposition, + check_variation_descriptor, + check_descriptor, + check_method, + check_document, + ) @pytest.mark.asyncio async def test_civic_detail_flag_prognostic( - query_handler, civic_eid26_statement, civic_eid26_proposition, civic_vid65, - civic_did3, civic_gid29, method1, pmid_16384925, check_statement, check_proposition, - check_variation_descriptor, check_descriptor, check_method, check_document + query_handler, + civic_eid26_statement, + civic_eid26_proposition, + civic_vid65, + civic_did3, + civic_gid29, + method1, + pmid_16384925, + check_statement, + check_proposition, + check_variation_descriptor, + check_descriptor, + check_method, + check_document, ): """Test that detail flag works correctly for CIViC Prognostic Response.""" - response = await query_handler.search(statement_id='civic.eid:26', detail=False) + response = await query_handler.search(statement_id="civic.eid:26", detail=False) assert_keys_for_detail_false(response.keys()) - response = await query_handler.search(statement_id='civic.eid:26', detail=True) + response = await query_handler.search(statement_id="civic.eid:26", detail=True) assert_keys_for_detail_true(response.keys(), response, tr_response=False) - assert_response_items(response, civic_eid26_statement, - civic_eid26_proposition, - civic_vid65, civic_gid29, civic_did3, - method1, pmid_16384925, None, check_statement, - check_proposition, check_variation_descriptor, - check_descriptor, check_method, check_document) + assert_response_items( + response, + civic_eid26_statement, + civic_eid26_proposition, + civic_vid65, + civic_gid29, + civic_did3, + method1, + pmid_16384925, + None, + check_statement, + check_proposition, + check_variation_descriptor, + check_descriptor, + check_method, + check_document, + ) @pytest.mark.asyncio async def test_moa_detail_flag( - query_handler, moa_aid71_statement, moa_aid71_proposition, moa_vid71, moa_abl1, - moa_imatinib, moa_chronic_myelogenous_leukemia, method4, pmid_11423618, - check_statement, check_proposition, check_variation_descriptor, check_descriptor, - check_method, check_document + query_handler, + moa_aid71_statement, + moa_aid71_proposition, + moa_vid71, + moa_abl1, + moa_imatinib, + moa_chronic_myelogenous_leukemia, + method4, + pmid_11423618, + check_statement, + check_proposition, + check_variation_descriptor, + check_descriptor, + check_method, + check_document, ): """Test that detail flag works correctly for MOA.""" - response = await query_handler.search(statement_id='moa.assertion:71', detail=False) + response = await query_handler.search(statement_id="moa.assertion:71", detail=False) assert_keys_for_detail_false(response.keys()) - response = await query_handler.search(statement_id='moa.assertion:71', detail=True) + response = await query_handler.search(statement_id="moa.assertion:71", detail=True) assert_keys_for_detail_true(response.keys(), response) - assert_response_items(response, moa_aid71_statement, moa_aid71_proposition, - moa_vid71, moa_abl1, - moa_chronic_myelogenous_leukemia, method4, - pmid_11423618, moa_imatinib, check_statement, - check_proposition, check_variation_descriptor, - check_descriptor, check_method, check_document) + assert_response_items( + response, + moa_aid71_statement, + moa_aid71_proposition, + moa_vid71, + moa_abl1, + moa_chronic_myelogenous_leukemia, + method4, + pmid_11423618, + moa_imatinib, + check_statement, + check_proposition, + check_variation_descriptor, + check_descriptor, + check_method, + check_document, + ) @pytest.mark.asyncio async def test_oncokb_braf_v600e( - query_handler, oncokb_diagnostic_statement1, - oncokb_diagnostic_proposition1, oncokb_therapeutic_statement1, - oncokb_therapeutic_proposition1, oncokb_braf_v600e_vd, oncokb_braf_gene_descriptor, - oncokb_trametinib_therapy_descriptor, oncokb_ecd_disease_descriptor, - oncokb_mel_disease_descriptor, oncokb_method, oncokb_diagnostic1_documents, - oncokb_therapeutic1_documents_query, check_statement, check_proposition, - check_variation_descriptor, check_descriptor, check_method, check_document + query_handler, + oncokb_diagnostic_statement1, + oncokb_diagnostic_proposition1, + oncokb_therapeutic_statement1, + oncokb_therapeutic_proposition1, + oncokb_braf_v600e_vd, + oncokb_braf_gene_descriptor, + oncokb_trametinib_therapy_descriptor, + oncokb_ecd_disease_descriptor, + oncokb_mel_disease_descriptor, + oncokb_method, + oncokb_diagnostic1_documents, + oncokb_therapeutic1_documents_query, + check_statement, + check_proposition, + check_variation_descriptor, + check_descriptor, + check_method, + check_document, ): """Test that OncoKB queries work as expected""" statement_id = oncokb_diagnostic_statement1["id"] - response = await query_handler.search(statement_id=statement_id, - variation="BRAF V600E", detail=True) + response = await query_handler.search( + statement_id=statement_id, variation="BRAF V600E", detail=True + ) assert_keys_for_detail_true(response.keys(), response, tr_response=False) assert_response_items( - response, oncokb_diagnostic_statement1, oncokb_diagnostic_proposition1, - oncokb_braf_v600e_vd, oncokb_braf_gene_descriptor, - oncokb_ecd_disease_descriptor, oncokb_method, oncokb_diagnostic1_documents, - None, check_statement, check_proposition, check_variation_descriptor, - check_descriptor, check_method, check_document + response, + oncokb_diagnostic_statement1, + oncokb_diagnostic_proposition1, + oncokb_braf_v600e_vd, + oncokb_braf_gene_descriptor, + oncokb_ecd_disease_descriptor, + oncokb_method, + oncokb_diagnostic1_documents, + None, + check_statement, + check_proposition, + check_variation_descriptor, + check_descriptor, + check_method, + check_document, ) statement_id = oncokb_therapeutic_statement1["id"] - response = await query_handler.search(statement_id=statement_id, - variation="BRAF V600E", detail=True) + response = await query_handler.search( + statement_id=statement_id, variation="BRAF V600E", detail=True + ) assert_keys_for_detail_true(response.keys(), response, tr_response=True) assert_response_items( - response, oncokb_therapeutic_statement1, oncokb_therapeutic_proposition1, - oncokb_braf_v600e_vd, oncokb_braf_gene_descriptor, - oncokb_mel_disease_descriptor, oncokb_method, - oncokb_therapeutic1_documents_query, oncokb_trametinib_therapy_descriptor, - check_statement, check_proposition, check_variation_descriptor, - check_descriptor, check_method, check_document + response, + oncokb_therapeutic_statement1, + oncokb_therapeutic_proposition1, + oncokb_braf_v600e_vd, + oncokb_braf_gene_descriptor, + oncokb_mel_disease_descriptor, + oncokb_method, + oncokb_therapeutic1_documents_query, + oncokb_trametinib_therapy_descriptor, + check_statement, + check_proposition, + check_variation_descriptor, + check_descriptor, + check_method, + check_document, ) @@ -625,111 +778,130 @@ async def test_oncokb_braf_v600e( async def test_no_matches(query_handler): """Test invalid query matches.""" # GA instead of VA - response = await query_handler.search('ga4gh:GA.WyOqFMhc8aOnMFgdY0uM7nSLNqxVPAiR') + response = await query_handler.search("ga4gh:GA.WyOqFMhc8aOnMFgdY0uM7nSLNqxVPAiR") assert_no_match(response) # Invalid ID - response = \ - await query_handler.search(disease='ncit:C292632425235321524352435623462') + response = await query_handler.search( + disease="ncit:C292632425235321524352435623462" + ) assert_no_match(response) # Empty query - response = await query_handler.search(disease='') + response = await query_handler.search(disease="") assert_no_match(response) - response = await query_handler.search(gene='', therapy='', variation='', disease='') + response = await query_handler.search(gene="", therapy="", variation="", disease="") assert_no_match(response) - assert response['warnings'] == ['No parameters were entered.'] + assert response["warnings"] == ["No parameters were entered."] # Invalid variation - response = await query_handler.search(variation='v600e') + response = await query_handler.search(variation="v600e") assert_no_match(response) - response = query_handler.search_by_id('') + response = query_handler.search_by_id("") assert_no_match_id(response) - response = query_handler.search_by_id(' ') + response = query_handler.search_by_id(" ") assert_no_match_id(response) - response = query_handler.search_by_id('aid6') + response = query_handler.search_by_id("aid6") assert_no_match_id(response) - response = query_handler.search_by_id('civc.assertion:6') + response = query_handler.search_by_id("civc.assertion:6") assert_no_match_id(response) -def test_civic_id_search(query_handler, civic_eid2997_statement, - civic_vid33, civic_gid19, civic_tid146, civic_did8, - pmid_23982599, method1, check_statement, - check_variation_descriptor, check_descriptor, - check_method, check_document): +def test_civic_id_search( + query_handler, + civic_eid2997_statement, + civic_vid33, + civic_gid19, + civic_tid146, + civic_did8, + pmid_23982599, + method1, + check_statement, + check_variation_descriptor, + check_descriptor, + check_method, + check_document, +): """Test search on civic node id""" - res = query_handler.search_by_id('civic.eid:2997') - check_statement(res['statement'], civic_eid2997_statement) - - res = query_handler.search_by_id('civic.vid:33') - check_variation_descriptor(res['variation_descriptor'], civic_vid33) - - res = query_handler.search_by_id('civic.gid:19') - check_descriptor(res['gene_descriptor'], civic_gid19) - - res = query_handler.search_by_id('civic.tid:146') - check_descriptor(res['therapy_descriptor'], civic_tid146) - - res = query_handler.search_by_id('civic.did:8') - check_descriptor(res['disease_descriptor'], civic_did8) - - res = query_handler.search_by_id('pmid:23982599') - check_document(res['document'], pmid_23982599) - - res = query_handler.search_by_id('method:1') - check_method(res['method'], method1) - - -def test_moa_id_search(query_handler, moa_aid71_statement, - moa_vid71, moa_abl1, moa_imatinib, - moa_chronic_myelogenous_leukemia, pmid_11423618, - method4, check_statement, check_variation_descriptor, - check_descriptor, check_method, check_document): + res = query_handler.search_by_id("civic.eid:2997") + check_statement(res["statement"], civic_eid2997_statement) + + res = query_handler.search_by_id("civic.vid:33") + check_variation_descriptor(res["variation_descriptor"], civic_vid33) + + res = query_handler.search_by_id("civic.gid:19") + check_descriptor(res["gene_descriptor"], civic_gid19) + + res = query_handler.search_by_id("civic.tid:146") + check_descriptor(res["therapy_descriptor"], civic_tid146) + + res = query_handler.search_by_id("civic.did:8") + check_descriptor(res["disease_descriptor"], civic_did8) + + res = query_handler.search_by_id("pmid:23982599") + check_document(res["document"], pmid_23982599) + + res = query_handler.search_by_id("method:1") + check_method(res["method"], method1) + + +def test_moa_id_search( + query_handler, + moa_aid71_statement, + moa_vid71, + moa_abl1, + moa_imatinib, + moa_chronic_myelogenous_leukemia, + pmid_11423618, + method4, + check_statement, + check_variation_descriptor, + check_descriptor, + check_method, + check_document, +): """Test search on moa node id""" - res = query_handler.search_by_id('moa.assertion:71') - check_statement(res['statement'], moa_aid71_statement) + res = query_handler.search_by_id("moa.assertion:71") + check_statement(res["statement"], moa_aid71_statement) - res = query_handler.search_by_id('moa.variant:71') - check_variation_descriptor(res['variation_descriptor'], moa_vid71) + res = query_handler.search_by_id("moa.variant:71") + check_variation_descriptor(res["variation_descriptor"], moa_vid71) - res = query_handler.search_by_id('moa.normalize.gene:ABL1') - check_descriptor(res['gene_descriptor'], moa_abl1) + res = query_handler.search_by_id("moa.normalize.gene:ABL1") + check_descriptor(res["gene_descriptor"], moa_abl1) - res = query_handler.search_by_id('moa.normalize.therapy:Imatinib') - check_descriptor(res['therapy_descriptor'], moa_imatinib) + res = query_handler.search_by_id("moa.normalize.therapy:Imatinib") + check_descriptor(res["therapy_descriptor"], moa_imatinib) - res = query_handler.search_by_id('moa.normalize.disease:oncotree%3ACML') - check_descriptor(res['disease_descriptor'], - moa_chronic_myelogenous_leukemia) + res = query_handler.search_by_id("moa.normalize.disease:oncotree%3ACML") + check_descriptor(res["disease_descriptor"], moa_chronic_myelogenous_leukemia) - res = query_handler.search_by_id('moa.normalize.disease:oncotree:CML') - check_descriptor(res['disease_descriptor'], - moa_chronic_myelogenous_leukemia) + res = query_handler.search_by_id("moa.normalize.disease:oncotree:CML") + check_descriptor(res["disease_descriptor"], moa_chronic_myelogenous_leukemia) - res = query_handler.search_by_id('pmid:11423618') - check_document(res['document'], pmid_11423618) + res = query_handler.search_by_id("pmid:11423618") + check_document(res["document"], pmid_11423618) - res = query_handler.search_by_id(' method:4 ') - check_method(res['method'], method4) + res = query_handler.search_by_id(" method:4 ") + check_method(res["method"], method4) @pytest.mark.asyncio async def test_service_meta(query_handler): """Test service meta in response""" + def check_service_meta(response): """Check service meta in response is correct""" assert "service_meta_" in response service_meta_ = response["service_meta_"] assert service_meta_["name"] == "metakb" assert service_meta_["version"] == __version__ - assert service_meta_["url"] == \ - "https://github.com/cancervariants/metakb" + assert service_meta_["url"] == "https://github.com/cancervariants/metakb" statement_id = "civic.eid:2997" resp = await query_handler.search(statement_id=statement_id) diff --git a/tests/unit/test_search_statements.py b/tests/unit/test_search_statements.py index 4b7550ce..b1c16d68 100644 --- a/tests/unit/test_search_statements.py +++ b/tests/unit/test_search_statements.py @@ -13,13 +13,19 @@ def civic_vid33_with_gene(civic_vid33, civic_gid19): @pytest.fixture(scope="module") -def civic_eid2997(civic_eid2997_proposition, civic_vid33_with_gene, - civic_tid146, civic_did8, method1, pmid_23982599): +def civic_eid2997( + civic_eid2997_proposition, + civic_vid33_with_gene, + civic_tid146, + civic_did8, + method1, + pmid_23982599, +): """Create test fixture for CIViC EID2997""" return { "id": "civic.eid:2997", "type": "Statement", - "description": "Afatinib, an irreversible inhibitor of the ErbB family of tyrosine kinases has been approved in the US for the first-line treatment of patients with metastatic non-small-cell lung cancer (NSCLC) who have tumours with EGFR exon 19 deletions or exon 21 (L858R) substitution mutations as detected by a US FDA-approved test", # noqa: E501 + "description": "Afatinib, an irreversible inhibitor of the ErbB family of tyrosine kinases has been approved in the US for the first-line treatment of patients with metastatic non-small-cell lung cancer (NSCLC) who have tumours with EGFR exon 19 deletions or exon 21 (L858R) substitution mutations as detected by a US FDA-approved test", "direction": "supports", "evidence_level": "civic.evidence_level:A", "proposition": civic_eid2997_proposition, @@ -28,17 +34,23 @@ def civic_eid2997(civic_eid2997_proposition, civic_vid33_with_gene, "therapy_descriptor": civic_tid146, "disease_descriptor": civic_did8, "method": method1, - "supported_by": [pmid_23982599] + "supported_by": [pmid_23982599], } @pytest.fixture(scope="module") -def civic_aid6(civic_eid2997_proposition, civic_vid33_with_gene, civic_tid146, - civic_did8, method2, civic_aid6_document): +def civic_aid6( + civic_eid2997_proposition, + civic_vid33_with_gene, + civic_tid146, + civic_did8, + method2, + civic_aid6_document, +): """Create test fixture for CIViC AID6""" return { "id": "civic.aid:6", - "description": "L858R is among the most common sensitizing EGFR mutations in NSCLC, and is assessed via DNA mutational analysis, including Sanger sequencing and next generation sequencing methods. Tyrosine kinase inhibitor afatinib is FDA approved as a first line systemic therapy in NSCLC with sensitizing EGFR mutation.", # noqa: E501 + "description": "L858R is among the most common sensitizing EGFR mutations in NSCLC, and is assessed via DNA mutational analysis, including Sanger sequencing and next generation sequencing methods. Tyrosine kinase inhibitor afatinib is FDA approved as a first line systemic therapy in NSCLC with sensitizing EGFR mutation.", "direction": "supports", "evidence_level": "amp_asco_cap_2017_level:1A", "proposition": civic_eid2997_proposition, @@ -48,12 +60,15 @@ def civic_aid6(civic_eid2997_proposition, civic_vid33_with_gene, civic_tid146, "disease_descriptor": civic_did8, "method": method2, "supported_by": [ - civic_aid6_document, "civic.eid:2997", - "civic.eid:2629", "civic.eid:982", - "civic.eid:968", "civic.eid:883", - "civic.eid:879" + civic_aid6_document, + "civic.eid:2997", + "civic.eid:2629", + "civic.eid:982", + "civic.eid:968", + "civic.eid:883", + "civic.eid:879", ], - "type": "Statement" + "type": "Statement", } @@ -66,14 +81,19 @@ def moa_vid71_with_gene(moa_vid71, moa_abl1): @pytest.fixture(scope="module") -def moa_aid71(moa_aid71_proposition, moa_vid71_with_gene, moa_imatinib, - moa_chronic_myelogenous_leukemia, method4, - pmid_11423618): +def moa_aid71( + moa_aid71_proposition, + moa_vid71_with_gene, + moa_imatinib, + moa_chronic_myelogenous_leukemia, + method4, + pmid_11423618, +): """Create test fixture for MOA Assertion 71""" return { "id": "moa.assertion:71", "type": "Statement", - "description": "T315I mutant ABL1 in p210 BCR-ABL cells resulted in retained high levels of phosphotyrosine at increasing concentrations of inhibitor STI-571, whereas wildtype appropriately received inhibition.", # noqa: E501 + "description": "T315I mutant ABL1 in p210 BCR-ABL cells resulted in retained high levels of phosphotyrosine at increasing concentrations of inhibitor STI-571, whereas wildtype appropriately received inhibition.", "evidence_level": "moa.evidence_level:Preclinical", "proposition": moa_aid71_proposition, "variation_origin": "somatic", @@ -81,15 +101,19 @@ def moa_aid71(moa_aid71_proposition, moa_vid71_with_gene, moa_imatinib, "therapy_descriptor": moa_imatinib, "disease_descriptor": moa_chronic_myelogenous_leukemia, "method": method4, - "supported_by": [pmid_11423618] + "supported_by": [pmid_11423618], } @pytest.fixture(scope="module") def oncokb_diagnostic1( - oncokb_diagnostic_statement1, oncokb_diagnostic_proposition1, oncokb_braf_v600e_vd, - oncokb_braf_gene_descriptor, oncokb_ecd_disease_descriptor, oncokb_method, - oncokb_diagnostic1_documents + oncokb_diagnostic_statement1, + oncokb_diagnostic_proposition1, + oncokb_braf_v600e_vd, + oncokb_braf_gene_descriptor, + oncokb_ecd_disease_descriptor, + oncokb_method, + oncokb_diagnostic1_documents, ): """Create test fixture for OncoKB Diagnostic evidence for BRAF V600E""" vd = copy.deepcopy(oncokb_braf_v600e_vd) @@ -102,16 +126,20 @@ def oncokb_diagnostic1( "variation_descriptor": vd, "disease_descriptor": oncokb_ecd_disease_descriptor, "method": oncokb_method, - "supported_by": oncokb_diagnostic1_documents + "supported_by": oncokb_diagnostic1_documents, } @pytest.fixture(scope="module") def oncokb_therapeutic1( - oncokb_therapeutic_statement1, oncokb_therapeutic_proposition1, - oncokb_braf_v600e_vd, oncokb_braf_gene_descriptor, oncokb_mel_disease_descriptor, - oncokb_trametinib_therapy_descriptor, oncokb_method, - oncokb_therapeutic1_documents_query + oncokb_therapeutic_statement1, + oncokb_therapeutic_proposition1, + oncokb_braf_v600e_vd, + oncokb_braf_gene_descriptor, + oncokb_mel_disease_descriptor, + oncokb_trametinib_therapy_descriptor, + oncokb_method, + oncokb_therapeutic1_documents_query, ): """Create test fixture for OncoKB Therapeutic evidence for BRAF V600E""" vd = copy.deepcopy(oncokb_braf_v600e_vd) @@ -119,14 +147,14 @@ def oncokb_therapeutic1( return { "id": oncokb_therapeutic_statement1["id"], "type": "Statement", - "description": "Trametinib is an oral small molecule inhibitor of MEK1/2 that is FDA-approved alone or with dabrafenib for the treatment of patients with metastatic melanoma harboring a V600E or V600K BRAF mutation. In an open-label, randomized Phase III trial, patients with BRAF V600E/K-mutated unresectable, metastatic melanoma received oral trametinib (2 mg once daily) or an intravenous regimen of either dacarbazine (1000 mg/m2) or paclitaxel (175 mg/m2) every three weeks. Trametinib demonstrated improved progression-free survival (HR for disease progression or death = 0.45) and six-month overall survival (81% vs. 67%; death HR = 0.54; p=0.01) (PMID: 22663011). However, like other MEK inhibitors, the benefit of trametinib is limited by adverse reactions, most notably grade three or four rash and diarrhea (PMID: 22663011). Trametinib is not typically used as monotherapy for patients with BRAF V600K melanoma given its lower response rate compared to BRAF inhibitors and combined BRAF and MEK inhibitors. Patients previously treated with a RAF inhibitor appear to be less likely than untreated patients to respond to trametinib treatment (PMID: 22663011), and FDA guidelines state that trametinib as a monotherapy is not indicated for these patients. Dabrafenib and trametinib are FDA-approved as a combination therapy, which has superior clinical outcomes compared to dabrafenib or trametinib monotherapy (PMID: 25399551, 25265492). Additionally, patients with melanoma treated with dabrafenib and trametinib in both the neoadjuvant and adjuvant settings had improved survival over patients given standard of care (PMID: 29361468).", # noqa: E501 + "description": "Trametinib is an oral small molecule inhibitor of MEK1/2 that is FDA-approved alone or with dabrafenib for the treatment of patients with metastatic melanoma harboring a V600E or V600K BRAF mutation. In an open-label, randomized Phase III trial, patients with BRAF V600E/K-mutated unresectable, metastatic melanoma received oral trametinib (2 mg once daily) or an intravenous regimen of either dacarbazine (1000 mg/m2) or paclitaxel (175 mg/m2) every three weeks. Trametinib demonstrated improved progression-free survival (HR for disease progression or death = 0.45) and six-month overall survival (81% vs. 67%; death HR = 0.54; p=0.01) (PMID: 22663011). However, like other MEK inhibitors, the benefit of trametinib is limited by adverse reactions, most notably grade three or four rash and diarrhea (PMID: 22663011). Trametinib is not typically used as monotherapy for patients with BRAF V600K melanoma given its lower response rate compared to BRAF inhibitors and combined BRAF and MEK inhibitors. Patients previously treated with a RAF inhibitor appear to be less likely than untreated patients to respond to trametinib treatment (PMID: 22663011), and FDA guidelines state that trametinib as a monotherapy is not indicated for these patients. Dabrafenib and trametinib are FDA-approved as a combination therapy, which has superior clinical outcomes compared to dabrafenib or trametinib monotherapy (PMID: 25399551, 25265492). Additionally, patients with melanoma treated with dabrafenib and trametinib in both the neoadjuvant and adjuvant settings had improved survival over patients given standard of care (PMID: 29361468).", "evidence_level": oncokb_therapeutic_statement1["evidence_level"], "proposition": oncokb_therapeutic_proposition1, "variation_descriptor": vd, "disease_descriptor": oncokb_mel_disease_descriptor, "therapy_descriptor": oncokb_trametinib_therapy_descriptor, "method": oncokb_method, - "supported_by": oncokb_therapeutic1_documents_query + "supported_by": oncokb_therapeutic1_documents_query, } @@ -150,8 +178,13 @@ def assert_no_match(response): def check_statement_assertions( - actual, test, check_proposition, check_variation_descriptor, - check_descriptor, check_method): + actual, + test, + check_proposition, + check_variation_descriptor, + check_descriptor, + check_method, +): """Check that statement response is correct""" for key in {"id", "type", "evidence_level", "method"}: assert actual[key] == test[key], key @@ -163,13 +196,15 @@ def check_statement_assertions( assert key not in actual.keys(), key check_proposition(actual["proposition"], test["proposition"]) - check_variation_descriptor(actual["variation_descriptor"], - test["variation_descriptor"], - check_descriptor=check_descriptor, nested=True) + check_variation_descriptor( + actual["variation_descriptor"], + test["variation_descriptor"], + check_descriptor=check_descriptor, + nested=True, + ) check_descriptor(actual["disease_descriptor"], test["disease_descriptor"]) if test.get("therapy_descriptor"): - check_descriptor(actual["therapy_descriptor"], - test["therapy_descriptor"]) + check_descriptor(actual["therapy_descriptor"], test["therapy_descriptor"]) else: assert actual.get("therapy_descriptor") is None check_method(actual["method"], test["method"]) @@ -180,23 +215,39 @@ def check_statement_assertions( @pytest.mark.asyncio async def test_civic_eid2997( - query_handler, civic_eid2997, check_proposition, - check_variation_descriptor, check_descriptor, check_method): + query_handler, + civic_eid2997, + check_proposition, + check_variation_descriptor, + check_descriptor, + check_method, +): """Test that search_statements works correctly for CIVIC EID2997""" resp = await query_handler.search_statements(statement_id="civic.eid:2997") assert len(resp["statements"]) == 1 assert resp["matches"]["statements"] == ["civic.eid:2997"] assert len(resp["matches"]["propositions"]) == 1 check_statement_assertions( - resp["statements"][0], civic_eid2997, check_proposition, - check_variation_descriptor, check_descriptor, check_method) + resp["statements"][0], + civic_eid2997, + check_proposition, + check_variation_descriptor, + check_descriptor, + check_method, + ) assert resp["warnings"] == [] @pytest.mark.asyncio async def test_civic_aid6( - query_handler, civic_aid6, civic_eid2997, check_proposition, - check_variation_descriptor, check_descriptor, check_method): + query_handler, + civic_aid6, + civic_eid2997, + check_proposition, + check_variation_descriptor, + check_descriptor, + check_method, +): """Test that search_statements works correctly for CIVIC EID2997""" resp = await query_handler.search_statements(statement_id="civic.aid:6") assert len(resp["statements"]) == 7 @@ -209,62 +260,102 @@ async def test_civic_aid6( for s in resp["statements"]: if s["id"] == "civic.eid:2997": check_statement_assertions( - s, civic_eid2997, check_proposition, - check_variation_descriptor, check_descriptor, check_method) + s, + civic_eid2997, + check_proposition, + check_variation_descriptor, + check_descriptor, + check_method, + ) found_eid2997 = True elif s["id"] == "civic.aid:6": check_statement_assertions( - s, civic_aid6, check_proposition, - check_variation_descriptor, check_descriptor, check_method) + s, + civic_aid6, + check_proposition, + check_variation_descriptor, + check_descriptor, + check_method, + ) found_aid6 = True assert found_eid2997 assert found_aid6 @pytest.mark.asyncio -async def test_moa(query_handler, moa_aid71, check_proposition, - check_variation_descriptor, check_descriptor, check_method): +async def test_moa( + query_handler, + moa_aid71, + check_proposition, + check_variation_descriptor, + check_descriptor, + check_method, +): """Test that search_statements works correctly for MOA Assertion 71""" - resp = await query_handler.search_statements( - statement_id="moa.assertion:71") + resp = await query_handler.search_statements(statement_id="moa.assertion:71") assert len(resp["statements"]) == 1 check_statement_assertions( - resp["statements"][0], moa_aid71, check_proposition, - check_variation_descriptor, check_descriptor, check_method) + resp["statements"][0], + moa_aid71, + check_proposition, + check_variation_descriptor, + check_descriptor, + check_method, + ) assert resp["warnings"] == [] @pytest.mark.asyncio async def test_oncokb_diagnostic( - query_handler, oncokb_diagnostic1, check_proposition, check_variation_descriptor, - check_descriptor, check_method + query_handler, + oncokb_diagnostic1, + check_proposition, + check_variation_descriptor, + check_descriptor, + check_method, ): """Test that search_statements works correctly for OncoKB Diagnostic evidence for BRAF V600E """ resp = await query_handler.search_statements( - statement_id=oncokb_diagnostic1["id"], variation="BRAF V600E") + statement_id=oncokb_diagnostic1["id"], variation="BRAF V600E" + ) assert len(resp["statements"]) == 1 check_statement_assertions( - resp["statements"][0], oncokb_diagnostic1, check_proposition, - check_variation_descriptor, check_descriptor, check_method) + resp["statements"][0], + oncokb_diagnostic1, + check_proposition, + check_variation_descriptor, + check_descriptor, + check_method, + ) assert resp["warnings"] == [] @pytest.mark.asyncio async def test_oncokb_therapeutic( - query_handler, oncokb_therapeutic1, check_proposition, check_variation_descriptor, - check_descriptor, check_method + query_handler, + oncokb_therapeutic1, + check_proposition, + check_variation_descriptor, + check_descriptor, + check_method, ): """Test that search_statements works correctly for OncoKB Therapeutic evidence for BRAF V600E """ resp = await query_handler.search_statements( - statement_id=oncokb_therapeutic1["id"], variation="BRAF V600E") + statement_id=oncokb_therapeutic1["id"], variation="BRAF V600E" + ) assert len(resp["statements"]) == 1 check_statement_assertions( - resp["statements"][0], oncokb_therapeutic1, check_proposition, - check_variation_descriptor, check_descriptor, check_method) + resp["statements"][0], + oncokb_therapeutic1, + check_proposition, + check_variation_descriptor, + check_descriptor, + check_method, + ) assert resp["warnings"] == [] @@ -286,7 +377,8 @@ async def test_no_matches(query_handler): """Test invalid queries""" # invalid vrs variation prefix resp = await query_handler.search_statements( - variation="ga4gh:variation.kgjrhgf84CEndyLjKdAO0RxN-e3pJjxA") + variation="ga4gh:variation.kgjrhgf84CEndyLjKdAO0RxN-e3pJjxA" + ) assert_no_match(resp) # invalid id diff --git a/tests/unit/transform/test_civic_transform_diagnostic.py b/tests/unit/transform/test_civic_transform_diagnostic.py index c2699627..ee8904e5 100644 --- a/tests/unit/transform/test_civic_transform_diagnostic.py +++ b/tests/unit/transform/test_civic_transform_diagnostic.py @@ -1,9 +1,10 @@ """Test CIViC Transformation to common data model for prognostic.""" +import json + import pytest import pytest_asyncio -from metakb.transform.civic import CIViCTransform from metakb import PROJECT_ROOT -import json +from metakb.transform.civic import CIViCTransform DATA_DIR = PROJECT_ROOT / "tests" / "data" / "transform" / "diagnostic" FILENAME = "civic_cdm.json" @@ -14,8 +15,9 @@ async def data(normalizers): """Create a CIViC Transform test fixture.""" harvester_path = DATA_DIR / "civic_harvester.json" - c = CIViCTransform(data_dir=DATA_DIR, harvester_path=harvester_path, - normalizers=normalizers) + c = CIViCTransform( + data_dir=DATA_DIR, harvester_path=harvester_path, normalizers=normalizers + ) await c.transform() c.create_json(transform_dir=DATA_DIR, filename=FILENAME) with open(DATA_DIR / FILENAME, "r") as f: @@ -24,19 +26,17 @@ async def data(normalizers): @pytest.fixture(scope="module") -def statements(civic_eid2_statement, civic_eid74_statement, - civic_aid9_statement): +def statements(civic_eid2_statement, civic_eid74_statement, civic_aid9_statement): """Create test fixture for statements.""" return [civic_eid2_statement, civic_eid74_statement, civic_aid9_statement] @pytest.fixture(scope="module") -def propositions(civic_eid2_proposition, civic_eid74_proposition, - civic_aid9_proposition): +def propositions( + civic_eid2_proposition, civic_eid74_proposition, civic_aid9_proposition +): """Create test fixture for proposition.""" - return [ - civic_eid2_proposition, civic_eid74_proposition, civic_aid9_proposition - ] + return [civic_eid2_proposition, civic_eid74_proposition, civic_aid9_proposition] @pytest.fixture(scope="module") @@ -63,17 +63,39 @@ def documents(pmid_15146165, pmid_18073307): return [pmid_15146165, pmid_18073307] -def test_civic_cdm(data, statements, propositions, variation_descriptors, - gene_descriptors, disease_descriptors, - civic_methods, documents, check_statement, - check_proposition, check_variation_descriptor, - check_descriptor, check_document, check_method, - check_transformed_cdm): +def test_civic_cdm( + data, + statements, + propositions, + variation_descriptors, + gene_descriptors, + disease_descriptors, + civic_methods, + documents, + check_statement, + check_proposition, + check_variation_descriptor, + check_descriptor, + check_document, + check_method, + check_transformed_cdm, +): """Test that civic transform works correctly.""" check_transformed_cdm( - data, statements, propositions, variation_descriptors, - gene_descriptors, disease_descriptors, None, - civic_methods, documents, check_statement, check_proposition, - check_variation_descriptor, check_descriptor, check_document, - check_method, DATA_DIR / FILENAME + data, + statements, + propositions, + variation_descriptors, + gene_descriptors, + disease_descriptors, + None, + civic_methods, + documents, + check_statement, + check_proposition, + check_variation_descriptor, + check_descriptor, + check_document, + check_method, + DATA_DIR / FILENAME, ) diff --git a/tests/unit/transform/test_civic_transform_prognostic.py b/tests/unit/transform/test_civic_transform_prognostic.py index 15cbacaf..2eb8ebec 100644 --- a/tests/unit/transform/test_civic_transform_prognostic.py +++ b/tests/unit/transform/test_civic_transform_prognostic.py @@ -1,9 +1,10 @@ """Test CIViC Transformation to common data model for prognostic.""" +import json + import pytest import pytest_asyncio -from metakb.transform.civic import CIViCTransform from metakb import PROJECT_ROOT -import json +from metakb.transform.civic import CIViCTransform DATA_DIR = PROJECT_ROOT / "tests" / "data" / "transform" / "prognostic" FILENAME = "civic_cdm.json" @@ -14,8 +15,9 @@ async def data(normalizers): """Create a CIViC Transform test fixture.""" harvester_path = DATA_DIR / "civic_harvester.json" - c = CIViCTransform(data_dir=DATA_DIR, harvester_path=harvester_path, - normalizers=normalizers) + c = CIViCTransform( + data_dir=DATA_DIR, harvester_path=harvester_path, normalizers=normalizers + ) await c.transform() c.create_json(transform_dir=DATA_DIR, filename=FILENAME) with open(DATA_DIR / FILENAME, "r") as f: @@ -59,17 +61,39 @@ def documents(pmid_16384925, pmid_27819322): return [pmid_16384925, pmid_27819322] -def test_civic_cdm(data, statements, propositions, variation_descriptors, - gene_descriptors, disease_descriptors, - civic_methods, documents, check_statement, - check_proposition, check_variation_descriptor, - check_descriptor, check_document, check_method, - check_transformed_cdm): +def test_civic_cdm( + data, + statements, + propositions, + variation_descriptors, + gene_descriptors, + disease_descriptors, + civic_methods, + documents, + check_statement, + check_proposition, + check_variation_descriptor, + check_descriptor, + check_document, + check_method, + check_transformed_cdm, +): """Test that civic transform works correctly.""" check_transformed_cdm( - data, statements, propositions, variation_descriptors, - gene_descriptors, disease_descriptors, None, - civic_methods, documents, check_statement, check_proposition, - check_variation_descriptor, check_descriptor, check_document, - check_method, DATA_DIR / FILENAME + data, + statements, + propositions, + variation_descriptors, + gene_descriptors, + disease_descriptors, + None, + civic_methods, + documents, + check_statement, + check_proposition, + check_variation_descriptor, + check_descriptor, + check_document, + check_method, + DATA_DIR / FILENAME, ) diff --git a/tests/unit/transform/test_civic_transform_therapeutic.py b/tests/unit/transform/test_civic_transform_therapeutic.py index ec56bb59..678c4fd5 100644 --- a/tests/unit/transform/test_civic_transform_therapeutic.py +++ b/tests/unit/transform/test_civic_transform_therapeutic.py @@ -1,10 +1,10 @@ """Test CIViC Transformation to common data model for Therapeutic Response.""" +import json + import pytest import pytest_asyncio -from metakb.transform.civic import CIViCTransform from metakb import PROJECT_ROOT -import json - +from metakb.transform.civic import CIViCTransform DATA_DIR = PROJECT_ROOT / "tests" / "data" / "transform" / "therapeutic" FILENAME = "civic_cdm.json" @@ -15,8 +15,9 @@ async def data(normalizers): """Create a CIViC Transform test fixture.""" harvester_path = DATA_DIR / "civic_harvester.json" - c = CIViCTransform(data_dir=DATA_DIR, harvester_path=harvester_path, - normalizers=normalizers) + c = CIViCTransform( + data_dir=DATA_DIR, harvester_path=harvester_path, normalizers=normalizers + ) await c.transform() c.create_json(transform_dir=DATA_DIR, filename=FILENAME) with open(DATA_DIR / FILENAME, "r") as f: @@ -66,17 +67,40 @@ def documents(pmid_23982599, civic_aid6_document): return [pmid_23982599, civic_aid6_document] -def test_civic_cdm(data, statements, propositions, variation_descriptors, - gene_descriptors, disease_descriptors, therapy_descriptors, - civic_methods, documents, check_statement, - check_proposition, check_variation_descriptor, - check_descriptor, check_document, check_method, - check_transformed_cdm): +def test_civic_cdm( + data, + statements, + propositions, + variation_descriptors, + gene_descriptors, + disease_descriptors, + therapy_descriptors, + civic_methods, + documents, + check_statement, + check_proposition, + check_variation_descriptor, + check_descriptor, + check_document, + check_method, + check_transformed_cdm, +): """Test that civic transform works correctly.""" check_transformed_cdm( - data, statements, propositions, variation_descriptors, - gene_descriptors, disease_descriptors, therapy_descriptors, - civic_methods, documents, check_statement, check_proposition, - check_variation_descriptor, check_descriptor, check_document, - check_method, DATA_DIR / FILENAME + data, + statements, + propositions, + variation_descriptors, + gene_descriptors, + disease_descriptors, + therapy_descriptors, + civic_methods, + documents, + check_statement, + check_proposition, + check_variation_descriptor, + check_descriptor, + check_document, + check_method, + DATA_DIR / FILENAME, ) diff --git a/tests/unit/transform/test_moa_transform.py b/tests/unit/transform/test_moa_transform.py index e530a47c..7a925419 100644 --- a/tests/unit/transform/test_moa_transform.py +++ b/tests/unit/transform/test_moa_transform.py @@ -1,9 +1,10 @@ """Test MOA Transformation to common data model""" +import json + import pytest import pytest_asyncio -from metakb.transform.moa import MOATransform from metakb import PROJECT_ROOT -import json +from metakb.transform.moa import MOATransform DATA_DIR = PROJECT_ROOT / "tests" / "data" / "transform" FILENAME = "moa_cdm.json" @@ -14,8 +15,9 @@ async def data(normalizers): """Create a MOA Transform test fixture.""" harvester_path = DATA_DIR / "moa_harvester.json" - moa = MOATransform(data_dir=DATA_DIR, harvester_path=harvester_path, - normalizers=normalizers) + moa = MOATransform( + data_dir=DATA_DIR, harvester_path=harvester_path, normalizers=normalizers + ) await moa.transform() moa.create_json(transform_dir=DATA_DIR, filename=FILENAME) with open(DATA_DIR / FILENAME, "r") as f: @@ -62,7 +64,7 @@ def asst71_disease_descriptors(moa_chronic_myelogenous_leukemia): @pytest.fixture(scope="module") def asst71_methods(method4): """Create assertion71 methods test fixture.""" - return[method4] + return [method4] @pytest.fixture(scope="module") @@ -71,19 +73,40 @@ def asst71_documents(pmid_11423618): return [pmid_11423618] -def test_moa_cdm(data, asst71_statements, asst71_propositions, - asst71_variation_descriptors, asst71_gene_descriptors, - asst71_disease_descriptors, asst71_therapy_descriptors, - asst71_methods, asst71_documents, check_statement, - check_proposition, check_variation_descriptor, - check_descriptor, check_document, check_method, - check_transformed_cdm): +def test_moa_cdm( + data, + asst71_statements, + asst71_propositions, + asst71_variation_descriptors, + asst71_gene_descriptors, + asst71_disease_descriptors, + asst71_therapy_descriptors, + asst71_methods, + asst71_documents, + check_statement, + check_proposition, + check_variation_descriptor, + check_descriptor, + check_document, + check_method, + check_transformed_cdm, +): """Test that moa transform works correctly.""" check_transformed_cdm( - data, asst71_statements, asst71_propositions, - asst71_variation_descriptors, asst71_gene_descriptors, - asst71_disease_descriptors, asst71_therapy_descriptors, asst71_methods, - asst71_documents, check_statement, check_proposition, - check_variation_descriptor, check_descriptor, check_document, - check_method, DATA_DIR / FILENAME + data, + asst71_statements, + asst71_propositions, + asst71_variation_descriptors, + asst71_gene_descriptors, + asst71_disease_descriptors, + asst71_therapy_descriptors, + asst71_methods, + asst71_documents, + check_statement, + check_proposition, + check_variation_descriptor, + check_descriptor, + check_document, + check_method, + DATA_DIR / FILENAME, ) diff --git a/tests/unit/transform/test_oncokb_transform.py b/tests/unit/transform/test_oncokb_transform.py index d4e873ef..aa98a270 100644 --- a/tests/unit/transform/test_oncokb_transform.py +++ b/tests/unit/transform/test_oncokb_transform.py @@ -6,10 +6,8 @@ import pytest import pytest_asyncio - -from metakb.transform.oncokb import OncoKBTransform from metakb import PROJECT_ROOT - +from metakb.transform.oncokb import OncoKBTransform DATA_DIR = PROJECT_ROOT / "tests" / "data" / "transform" FILENAME = "oncokb_cdm.json" @@ -20,8 +18,9 @@ async def data(normalizers): """Create a OncoKB Transform test fixture.""" harvester_path = DATA_DIR / "oncokb_harvester.json" - o = OncoKBTransform(data_dir=DATA_DIR, harvester_path=harvester_path, - normalizers=normalizers) + o = OncoKBTransform( + data_dir=DATA_DIR, harvester_path=harvester_path, normalizers=normalizers + ) await o.transform() o.create_json(transform_dir=DATA_DIR, filename=FILENAME) with open(DATA_DIR / FILENAME, "r") as f: @@ -36,8 +35,9 @@ def oncokb_statements(oncokb_diagnostic_statement1, oncokb_therapeutic_statement @pytest.fixture(scope="module") -def oncokb_propositions(oncokb_diagnostic_proposition1, - oncokb_therapeutic_proposition1): +def oncokb_propositions( + oncokb_diagnostic_proposition1, oncokb_therapeutic_proposition1 +): """Create OncoKB propositions test fixture""" return [oncokb_diagnostic_proposition1, oncokb_therapeutic_proposition1] @@ -61,8 +61,9 @@ def oncokb_variation_descriptors(oncokb_braf_v600e_vd): @pytest.fixture(scope="module") -def oncokb_disease_descriptors(oncokb_ecd_disease_descriptor, - oncokb_mel_disease_descriptor): +def oncokb_disease_descriptors( + oncokb_ecd_disease_descriptor, oncokb_mel_disease_descriptor +): """Create OncoKB disease descriptors test fixture""" return [oncokb_ecd_disease_descriptor, oncokb_mel_disease_descriptor] @@ -77,26 +78,10 @@ def oncokb_methods(oncokb_method): def oncokb_therapeutic1_documents(): """Create test fixture for OncoKB therapeutic evidence 1 documents""" return [ - { - "id": "pmid:29361468", - "label": "PubMed 29361468", - "type": "Document" - }, - { - "id": "pmid:25399551", - "label": "PubMed 25399551", - "type": "Document" - }, - { - "id": "pmid:22663011", - "label": "PubMed 22663011", - "type": "Document" - }, - { - "id": "pmid:25265492", - "label": "PubMed 25265492", - "type": "Document" - } + {"id": "pmid:29361468", "label": "PubMed 29361468", "type": "Document"}, + {"id": "pmid:25399551", "label": "PubMed 25399551", "type": "Document"}, + {"id": "pmid:22663011", "label": "PubMed 22663011", "type": "Document"}, + {"id": "pmid:25265492", "label": "PubMed 25265492", "type": "Document"}, ] @@ -107,17 +92,39 @@ def oncokb_documents(oncokb_diagnostic1_documents, oncokb_therapeutic1_documents def test_oncokb_transform( - data, oncokb_statements, oncokb_propositions, oncokb_variation_descriptors, - oncokb_gene_descriptors, oncokb_disease_descriptors, oncokb_therapy_descriptors, - oncokb_methods, oncokb_documents, check_statement, check_proposition, - check_variation_descriptor, check_descriptor, check_document, check_method, - check_transformed_cdm + data, + oncokb_statements, + oncokb_propositions, + oncokb_variation_descriptors, + oncokb_gene_descriptors, + oncokb_disease_descriptors, + oncokb_therapy_descriptors, + oncokb_methods, + oncokb_documents, + check_statement, + check_proposition, + check_variation_descriptor, + check_descriptor, + check_document, + check_method, + check_transformed_cdm, ): """Test that OncoKB transform works correctly""" check_transformed_cdm( - data, oncokb_statements, oncokb_propositions, oncokb_variation_descriptors, - oncokb_gene_descriptors, oncokb_disease_descriptors, oncokb_therapy_descriptors, - oncokb_methods, oncokb_documents, check_statement, check_proposition, - check_variation_descriptor, check_descriptor, check_document, check_method, - DATA_DIR / FILENAME + data, + oncokb_statements, + oncokb_propositions, + oncokb_variation_descriptors, + oncokb_gene_descriptors, + oncokb_disease_descriptors, + oncokb_therapy_descriptors, + oncokb_methods, + oncokb_documents, + check_statement, + check_proposition, + check_variation_descriptor, + check_descriptor, + check_document, + check_method, + DATA_DIR / FILENAME, )