From 253a477c32587093286297b885c3db0a69a224ad Mon Sep 17 00:00:00 2001 From: Florian Borchert Date: Fri, 5 Jan 2024 13:57:40 +0100 Subject: [PATCH 1/7] Update Examples --- ...02_BRONCO.ipynb => 01_BRONCO_German.ipynb} | 11 - ...GGPONC_NER.ipynb => 02_spaCy_German.ipynb} | 148 +-- examples/03_SNOMED_Linking_German.ipynb | 1116 +++++++++++++++++ examples/README.md | 13 + examples/Temp.ipynb | 216 ---- examples/conf/meddra_german.yaml | 8 +- examples/conf/snomed_german.yaml | 24 + examples/dicts/umls_source.py | 35 +- examples/ggponc2tui.csv | 128 -- examples/ggponc_tuis.csv | 414 ++++++ .../{ => old_examples}/conf/distemist.yaml | 0 .../distemist}/01_BioASQ_DisTEMIST.ipynb | 0 .../distemist}/distemist.py | 0 .../distemist}/distemist_bioasq.yaml | 0 .../distemist}/notebook_util.py | 0 examples/util.py | 15 + 16 files changed, 1671 insertions(+), 457 deletions(-) rename examples/{02_BRONCO.ipynb => 01_BRONCO_German.ipynb} (99%) rename examples/{03_GGPONC_NER.ipynb => 02_spaCy_German.ipynb} (93%) create mode 100644 examples/03_SNOMED_Linking_German.ipynb create mode 100644 examples/README.md delete mode 100644 examples/Temp.ipynb create mode 100644 examples/conf/snomed_german.yaml delete mode 100644 examples/ggponc2tui.csv create mode 100644 examples/ggponc_tuis.csv rename examples/{ => old_examples}/conf/distemist.yaml (100%) rename examples/{ => old_examples/distemist}/01_BioASQ_DisTEMIST.ipynb (100%) rename examples/{dicts => old_examples/distemist}/distemist.py (100%) rename examples/{ => old_examples/distemist}/distemist_bioasq.yaml (100%) rename examples/{ => old_examples/distemist}/notebook_util.py (100%) create mode 100644 examples/util.py diff --git a/examples/02_BRONCO.ipynb b/examples/01_BRONCO_German.ipynb similarity index 99% rename from examples/02_BRONCO.ipynb rename to examples/01_BRONCO_German.ipynb index 1e0fcf4..6f4dcab 100644 --- a/examples/02_BRONCO.ipynb +++ b/examples/01_BRONCO_German.ipynb @@ -1,16 +1,5 @@ { "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "0271244e-11b2-44ab-82e8-cf4cdcad6b13", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "os.environ['CUDA_VISIBLE_DEVICES'] = '5'" - ] - }, { "cell_type": "markdown", "id": "eb84db40-b21a-4fba-8174-a327a831ef76", diff --git a/examples/03_GGPONC_NER.ipynb b/examples/02_spaCy_German.ipynb similarity index 93% rename from examples/03_GGPONC_NER.ipynb rename to examples/02_spaCy_German.ipynb index 7725e73..3f768eb 100644 --- a/examples/03_GGPONC_NER.ipynb +++ b/examples/02_spaCy_German.ipynb @@ -5,7 +5,7 @@ "id": "6a3e8444-54ec-41a2-aaef-3849a22cde57", "metadata": {}, "source": [ - "# Combinining NER models with xMEN for German Clinical Entity Linking" + "# Combinining spaCy NER models with xMEN for German Clinical Entity Linking" ] }, { @@ -21,7 +21,7 @@ "id": "68020a8a-9c53-4f75-a08e-5078ec5b7f35", "metadata": {}, "source": [ - "### Download NER model" + "### Download NER Model" ] }, { @@ -70,7 +70,7 @@ "id": "407e49a2-7a40-4982-bf1b-d15de505d039", "metadata": {}, "source": [ - "# Run GGPONC NER Model on sample data" + "## Run spaCy NER Model on Sample Data" ] }, { @@ -290,7 +290,7 @@ "id": "0ae82793-8135-41a2-b6f5-75e39e1e57b1", "metadata": {}, "source": [ - "# Run Entity Linker" + "## Candidate Generation" ] }, { @@ -381,11 +381,11 @@ { "data": { "text/html": [ - "
[01/04/24 15:04:26] INFO     Loading hierarchical faiss index                                sap_bert_linker.py:153\n",
+       "
[01/05/24 13:54:34] INFO     Loading hierarchical faiss index                                sap_bert_linker.py:153\n",
        "
\n" ], "text/plain": [ - "\u001b[2;36m[01/04/24 15:04:26]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading hierarchical faiss index \u001b]8;id=605687;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py\u001b\\\u001b[2msap_bert_linker.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=745349;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py#153\u001b\\\u001b[2m153\u001b[0m\u001b]8;;\u001b\\\n" + "\u001b[2;36m[01/05/24 13:54:34]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading hierarchical faiss index \u001b]8;id=378882;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py\u001b\\\u001b[2msap_bert_linker.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=337023;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py#153\u001b\\\u001b[2m153\u001b[0m\u001b]8;;\u001b\\\n" ] }, "metadata": {}, @@ -400,7 +400,7 @@ "
\n" ], "text/plain": [ - "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading index from \u001b]8;id=911485;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\u001b\\\u001b[2mfaiss_indexer.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=711460;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#64\u001b\\\u001b[2m64\u001b[0m\u001b]8;;\u001b\\\n", + "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading index from \u001b]8;id=290282;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\u001b\\\u001b[2mfaiss_indexer.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=834572;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#64\u001b\\\u001b[2m64\u001b[0m\u001b]8;;\u001b\\\n", "\u001b[2;36m \u001b[0m \u001b[35m/home/Florian.Borchert/.cache/xmen/ggponc/index/sapbert/\u001b[0m\u001b[95membed_fais\u001b[0m \u001b[2m \u001b[0m\n", "\u001b[2;36m \u001b[0m \u001b[95ms_hier.pickle\u001b[0m \u001b[2m \u001b[0m\n" ] @@ -411,12 +411,12 @@ { "data": { "text/html": [ - "
[01/04/24 15:04:30] INFO     Loaded index of type <class 'faiss.swigfaiss.IndexHNSWFlat'> and   faiss_indexer.py:66\n",
+       "
[01/05/24 13:54:39] INFO     Loaded index of type <class 'faiss.swigfaiss.IndexHNSWFlat'> and   faiss_indexer.py:66\n",
        "                             size 2906321                                                                          \n",
        "
\n" ], "text/plain": [ - "\u001b[2;36m[01/04/24 15:04:30]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loaded index of type \u001b[1m<\u001b[0m\u001b[1;95mclass\u001b[0m\u001b[39m \u001b[0m\u001b[32m'faiss.swigfaiss.IndexHNSWFlat'\u001b[0m\u001b[1m>\u001b[0m and \u001b]8;id=874363;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\u001b\\\u001b[2mfaiss_indexer.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=363764;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#66\u001b\\\u001b[2m66\u001b[0m\u001b]8;;\u001b\\\n", + "\u001b[2;36m[01/05/24 13:54:39]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loaded index of type \u001b[1m<\u001b[0m\u001b[1;95mclass\u001b[0m\u001b[39m \u001b[0m\u001b[32m'faiss.swigfaiss.IndexHNSWFlat'\u001b[0m\u001b[1m>\u001b[0m and \u001b]8;id=653504;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\u001b\\\u001b[2mfaiss_indexer.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=376608;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#66\u001b\\\u001b[2m66\u001b[0m\u001b]8;;\u001b\\\n", "\u001b[2;36m \u001b[0m size \u001b[1;36m2906321\u001b[0m \u001b[2m \u001b[0m\n" ] }, @@ -438,7 +438,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "e3e916549d2f43c6a2d5414996f10447", + "model_id": "6bfe4331629c4563b9f39cf1a1e05481", "version_major": 2, "version_minor": 0 }, @@ -463,7 +463,7 @@ "id": "b3689a61-2ee1-4c81-afc7-6f6751654020", "metadata": {}, "source": [ - "## Semantic Type Filtering\n", + "### Semantic Type Filtering\n", "\n", "We filter the generated output to make sure the semantic type of the predicted concepts actually matches the semantic class of the named entity.\n", "\n", @@ -502,33 +502,22 @@ "metadata": {}, "outputs": [], "source": [ - "tui_df = pd.read_csv('ggponc2tui.csv')\n", - "type2tui = {}\n", - "for c in ['Diagnosis_or_Pathology', 'Other_Finding', 'Clinical_Drug', 'Nutrient_or_Body_Substance',\n", - " 'External_Substance', 'Therapeutic', 'Diagnostic']:\n", - " type2tui[c] = list(tui_df.TUI[tui_df[c] == 'x'].values)" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "0b7f48f2-9b8d-41f7-a526-49b9ff318170", - "metadata": {}, - "outputs": [], - "source": [ + "from xmen.data import SemanticTypeFilter\n", + "\n", + "type2tui = pd.read_csv('ggponc_tuis.csv').groupby('class')['tui'].apply(list).to_dict()\n", "type_filter = SemanticTypeFilter(type2tui, kb)" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 18, "id": "14ebda62-a868-4bc9-8c84-a0580935b0c7", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "c5305b0a8d1545d093edfd187ae62333", + "model_id": "fe0b1a358a4d4df39539c448e28845b4", "version_major": 2, "version_minor": 0 }, @@ -546,7 +535,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 19, "id": "308b40e7-d97a-4b58-b933-ecc8083fcc97", "metadata": {}, "outputs": [ @@ -574,7 +563,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 20, "id": "bc358d6c-c1a6-4b8a-9839-3a527c3f2a67", "metadata": {}, "outputs": [ @@ -600,40 +589,10 @@ "print(kb.cui_to_entity['C1739039'])" ] }, - { - "cell_type": "markdown", - "id": "b43d94d5-63b6-478a-8bc6-fdee335c9046", - "metadata": {}, - "source": [ - "## Output" - ] - }, { "cell_type": "code", - "execution_count": 22, - "id": "9b0c2d1d-727b-4f52-90be-dc468cf0f210", - "metadata": {}, - "outputs": [], - "source": [ - "def get_dataframe(predictions):\n", - " ents = []\n", - " for d in predictions:\n", - " for e in d['entities']:\n", - " span = ' '.join(e['text'])\n", - " label = e['type']\n", - " top_concept = e['normalized'][0] if len(e['normalized']) > 0 else None \n", - " if top_concept:\n", - " cui = top_concept['db_id']\n", - " ents.append({'mention' : span, 'class' : label, 'cui' : cui, 'canonical name' : kb.cui_to_entity[cui].canonical_name, 'linked by' : top_concept['predicted_by'], 'score' : top_concept['score']})\n", - " else:\n", - " ents.append({'mention' : span, 'class' : label, 'cui' : 'Not linkable'})\n", - " return pd.DataFrame(ents)" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "c3a659c4-0371-4b6f-9544-8b1fc6615c61", + "execution_count": 21, + "id": "c02ef4fd-dc7f-487d-b059-8643b3886ba5", "metadata": {}, "outputs": [ { @@ -846,13 +805,14 @@ "13 Plattenepithelkarzinom der Mundhoehle [sap] 0.987126 " ] }, - "execution_count": 23, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "get_dataframe(filtered_prediction)" + "from util import get_dataframe\n", + "get_dataframe(filtered_prediction, kb)" ] }, { @@ -865,7 +825,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 22, "id": "9b7b5c5f-e617-4975-822a-fa6bce38397a", "metadata": {}, "outputs": [ @@ -884,11 +844,11 @@ { "data": { "text/html": [ - "
[01/04/24 15:04:58] INFO     Loading hierarchical faiss index                                sap_bert_linker.py:153\n",
+       "
[01/05/24 13:55:12] INFO     Loading hierarchical faiss index                                sap_bert_linker.py:153\n",
        "
\n" ], "text/plain": [ - "\u001b[2;36m[01/04/24 15:04:58]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading hierarchical faiss index \u001b]8;id=693868;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py\u001b\\\u001b[2msap_bert_linker.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=370078;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py#153\u001b\\\u001b[2m153\u001b[0m\u001b]8;;\u001b\\\n" + "\u001b[2;36m[01/05/24 13:55:12]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading hierarchical faiss index \u001b]8;id=29828;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py\u001b\\\u001b[2msap_bert_linker.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=247780;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py#153\u001b\\\u001b[2m153\u001b[0m\u001b]8;;\u001b\\\n" ] }, "metadata": {}, @@ -903,7 +863,7 @@ "
\n" ], "text/plain": [ - "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading index from \u001b]8;id=347044;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\u001b\\\u001b[2mfaiss_indexer.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=137374;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#64\u001b\\\u001b[2m64\u001b[0m\u001b]8;;\u001b\\\n", + "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading index from \u001b]8;id=165201;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\u001b\\\u001b[2mfaiss_indexer.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=284277;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#64\u001b\\\u001b[2m64\u001b[0m\u001b]8;;\u001b\\\n", "\u001b[2;36m \u001b[0m \u001b[35m/home/Florian.Borchert/.cache/xmen/ggponc/index/sapbert/\u001b[0m\u001b[95membed_fais\u001b[0m \u001b[2m \u001b[0m\n", "\u001b[2;36m \u001b[0m \u001b[95ms_hier.pickle\u001b[0m \u001b[2m \u001b[0m\n" ] @@ -914,12 +874,12 @@ { "data": { "text/html": [ - "
[01/04/24 15:05:02] INFO     Loaded index of type <class 'faiss.swigfaiss.IndexHNSWFlat'> and   faiss_indexer.py:66\n",
+       "
[01/05/24 13:55:17] INFO     Loaded index of type <class 'faiss.swigfaiss.IndexHNSWFlat'> and   faiss_indexer.py:66\n",
        "                             size 2906321                                                                          \n",
        "
\n" ], "text/plain": [ - "\u001b[2;36m[01/04/24 15:05:02]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loaded index of type \u001b[1m<\u001b[0m\u001b[1;95mclass\u001b[0m\u001b[39m \u001b[0m\u001b[32m'faiss.swigfaiss.IndexHNSWFlat'\u001b[0m\u001b[1m>\u001b[0m and \u001b]8;id=496193;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\u001b\\\u001b[2mfaiss_indexer.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=514230;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#66\u001b\\\u001b[2m66\u001b[0m\u001b]8;;\u001b\\\n", + "\u001b[2;36m[01/05/24 13:55:17]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loaded index of type \u001b[1m<\u001b[0m\u001b[1;95mclass\u001b[0m\u001b[39m \u001b[0m\u001b[32m'faiss.swigfaiss.IndexHNSWFlat'\u001b[0m\u001b[1m>\u001b[0m and \u001b]8;id=433500;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\u001b\\\u001b[2mfaiss_indexer.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=574493;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#66\u001b\\\u001b[2m66\u001b[0m\u001b]8;;\u001b\\\n", "\u001b[2;36m \u001b[0m size \u001b[1;36m2906321\u001b[0m \u001b[2m \u001b[0m\n" ] }, @@ -934,14 +894,14 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 23, "id": "411c0814-5fc5-4b70-ab96-ef07833cbf08", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "b766e64187554e65916a818539a62f07", + "model_id": "80d4a77a89a848309a1342e4d8846505", "version_major": 2, "version_minor": 0 }, @@ -955,7 +915,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "7b20de793c204aa2821b9c839ed74977", + "model_id": "b2be54262eab46b8963c3626a5c88891", "version_major": 2, "version_minor": 0 }, @@ -973,7 +933,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 24, "id": "8185c65f-5645-4c38-a11a-a32a4f24a2a1", "metadata": {}, "outputs": [], @@ -983,7 +943,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 25, "id": "07d41590-f991-474a-9109-9b75d3184266", "metadata": {}, "outputs": [ @@ -998,7 +958,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "720781390c534053b3b05f6d32f7ea6d", + "model_id": "c09461422f7a44f08091419edeb74ab0", "version_major": 2, "version_minor": 0 }, @@ -1012,7 +972,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "7c893f7acd7745efa971e2da94fff95f", + "model_id": "48de214e279947e685cc324463bd7dca", "version_major": 2, "version_minor": 0 }, @@ -1026,7 +986,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "4fbfd064903042359d6bd259c9885675", + "model_id": "fe4d431f45834f45b064a867540c59a6", "version_major": 2, "version_minor": 0 }, @@ -1040,7 +1000,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "624d4b4e391346cabc7fee7ee24505ba", + "model_id": "b38b631271854036853f2e192174765b", "version_major": 2, "version_minor": 0 }, @@ -1058,7 +1018,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 26, "id": "6613f881-9323-42c7-a63c-710c6a1d758c", "metadata": {}, "outputs": [], @@ -1068,14 +1028,14 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 27, "id": "3af8dfbf-14b1-4a47-b76b-14868cde55e6", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "0de5605717c84cbb9934f1f08b26abbf", + "model_id": "1c56209e5bbb4a3aa2a33b6db8358e64", "version_major": 2, "version_minor": 0 }, @@ -1089,7 +1049,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "06074a2114af4bc58fe6a7b67c3fd4a8", + "model_id": "7d680c9f293f46a8939818239df7132d", "version_major": 2, "version_minor": 0 }, @@ -1103,7 +1063,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "45751c18d9a7432f901ef25e98e0a110", + "model_id": "4238449d87694b36b0ec3116e43927c8", "version_major": 2, "version_minor": 0 }, @@ -1117,7 +1077,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "5de8ca1225eb497bb70504d0b213e7fc", + "model_id": "ec464f1617704cb7997d30ae9748a9c7", "version_major": 2, "version_minor": 0 }, @@ -1135,7 +1095,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 28, "id": "8bdfb06e-a38b-431d-b82a-c515e6a00078", "metadata": {}, "outputs": [ @@ -1349,19 +1309,19 @@ "13 Plattenepithelkarzinom der Mundhoehle [ngram, sapbert] 0.987126 " ] }, - "execution_count": 30, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Before Re-ranking\n", - "get_dataframe(candidates)" + "get_dataframe(candidates, kb)" ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 29, "id": "0f95ab2d-5f3b-432c-9214-64316b3f1e99", "metadata": {}, "outputs": [ @@ -1575,23 +1535,15 @@ "13 Plattenepithelkarzinom der Mundhoehle [ngram, sapbert] 0.042749 " ] }, - "execution_count": 31, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# After Re-ranking\n", - "get_dataframe(reranked)" + "get_dataframe(reranked, kb)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9ace9bf0-c0ef-4c9e-9bd6-6ecce3cf03b6", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/examples/03_SNOMED_Linking_German.ipynb b/examples/03_SNOMED_Linking_German.ipynb new file mode 100644 index 0000000..3f43598 --- /dev/null +++ b/examples/03_SNOMED_Linking_German.ipynb @@ -0,0 +1,1116 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "6a3e8444-54ec-41a2-aaef-3849a22cde57", + "metadata": {}, + "source": [ + "# Linking Entities in German Medical Text to SNOMED CT" + ] + }, + { + "cell_type": "markdown", + "id": "eace0372-6610-4a03-bab6-32e186911bac", + "metadata": {}, + "source": [ + "## Preparation\n", + "\n", + "### Download NER Model" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "50943603-e8d3-4157-a675-0d9f41f5479c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Consider using `hf_transfer` for faster downloads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details.\n", + "../local_files/de_ggponc_medbertde-any-py3-none-any.whl\n" + ] + } + ], + "source": [ + "!huggingface-cli download phlobo/de_ggponc_medbertde de_ggponc_medbertde-any-py3-none-any.whl --local-dir ../local_files\n", + "!pip install -q ../local_files/de_ggponc_medbertde-any-py3-none-any.whl" + ] + }, + { + "cell_type": "markdown", + "id": "f216b159-7112-473e-a09e-6eff33fa7a34", + "metadata": {}, + "source": [ + "### Prepare Dicts and Index\n", + "\n", + "`xmen dict conf/snomed_german.yaml --code dicts/umls_source.py`\n", + "\n", + "`xmen index conf/snomed_german.yaml --all --overwrite`" + ] + }, + { + "cell_type": "markdown", + "id": "407e49a2-7a40-4982-bf1b-d15de505d039", + "metadata": {}, + "source": [ + "## Entity Tagging" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "521b389e-ce31-4e1f-9e37-29986d475a82", + "metadata": {}, + "outputs": [], + "source": [ + "import spacy\n", + "nlp = spacy.load('de_ggponc_medbertde')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "8049cf19-4a70-4bac-af9a-416c4e7a60e5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Cetuximab ist ein monoklonaler Antikörper, der gegen den epidermalen Wachstumsfaktorrezeptor (EGFR) gerichtet ist unddient zur Therapie des fortgeschrittenen kolorektalen Karzinoms zusammen mit Irinotecan oder in Kombination mit FOLFOX bzw. allein nach Versagen einer Behandlung mit Oxaliplatin und Irinotecan.',\n", + " 'Die HPV-Diagnostik hat beim Plattenepithelkarzinom der Mundhöhle keinen validen Nutzen als prognostischer Faktor.']" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sentences = [\n", + " \"Cetuximab ist ein monoklonaler Antikörper, der gegen den epidermalen Wachstumsfaktorrezeptor (EGFR) gerichtet ist und\" \\\n", + " \"dient zur Therapie des fortgeschrittenen kolorektalen Karzinoms zusammen mit Irinotecan oder in Kombination mit FOLFOX bzw. \" \\\n", + " \"allein nach Versagen einer Behandlung mit Oxaliplatin und Irinotecan.\",\n", + " \"Die HPV-Diagnostik hat beim Plattenepithelkarzinom der Mundhöhle keinen validen Nutzen als prognostischer Faktor.\"\n", + "]\n", + "sentences" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "5feca36d-cff3-4e02-858e-8d46c0c3d579", + "metadata": {}, + "outputs": [], + "source": [ + "docs = list(nlp.pipe(sentences))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "4a663fcf-c369-4e0c-9b90-c51fecba6019", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "bd0ab2ae-bd78-4c4f-bec7-e8fc119690e1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mentionclass
0CetuximabClinical_Drug
1monoklonaler AntikörperClinical_Drug
2epidermalen WachstumsfaktorrezeptorNutrient_or_Body_Substance
3EGFRNutrient_or_Body_Substance
4Therapie des fortgeschrittenen kolorektalen Ka...Therapeutic
5fortgeschrittenen kolorektalen KarzinomsDiagnosis_or_Pathology
6IrinotecanClinical_Drug
7FOLFOXTherapeutic
8Versagen einer BehandlungOther_Finding
9Behandlung mit Oxaliplatin und IrinotecanTherapeutic
10OxaliplatinClinical_Drug
11IrinotecanClinical_Drug
12HPV-DiagnostikDiagnostic
13Plattenepithelkarzinom der MundhöhleDiagnosis_or_Pathology
\n", + "
" + ], + "text/plain": [ + " mention \\\n", + "0 Cetuximab \n", + "1 monoklonaler Antikörper \n", + "2 epidermalen Wachstumsfaktorrezeptor \n", + "3 EGFR \n", + "4 Therapie des fortgeschrittenen kolorektalen Ka... \n", + "5 fortgeschrittenen kolorektalen Karzinoms \n", + "6 Irinotecan \n", + "7 FOLFOX \n", + "8 Versagen einer Behandlung \n", + "9 Behandlung mit Oxaliplatin und Irinotecan \n", + "10 Oxaliplatin \n", + "11 Irinotecan \n", + "12 HPV-Diagnostik \n", + "13 Plattenepithelkarzinom der Mundhöhle \n", + "\n", + " class \n", + "0 Clinical_Drug \n", + "1 Clinical_Drug \n", + "2 Nutrient_or_Body_Substance \n", + "3 Nutrient_or_Body_Substance \n", + "4 Therapeutic \n", + "5 Diagnosis_or_Pathology \n", + "6 Clinical_Drug \n", + "7 Therapeutic \n", + "8 Other_Finding \n", + "9 Therapeutic \n", + "10 Clinical_Drug \n", + "11 Clinical_Drug \n", + "12 Diagnostic \n", + "13 Diagnosis_or_Pathology " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ents = []\n", + "for d in docs:\n", + " for span in sorted(d.spans['entities'], key=lambda s: s.start):\n", + " ents.append({'mention' : span.text, 'class' : span.label_})\n", + "pd.DataFrame(ents)" + ] + }, + { + "cell_type": "markdown", + "id": "0ae82793-8135-41a2-b6f5-75e39e1e57b1", + "metadata": {}, + "source": [ + "## Candidate Generation" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "f5b0add5-9fb7-4eb4-9495-91ee5a5c95ea", + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "\n", + "from xmen.data import from_spacy\n", + "from xmen.linkers import SapBERTLinker, TFIDFNGramLinker, EnsembleLinker\n", + "from xmen import load_config" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "46831d66-e521-4225-9b92-a40d7b7582e4", + "metadata": {}, + "outputs": [], + "source": [ + "dataset = from_spacy(docs, span_key='entities')" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "5abb3126-ba45-47b1-a1a6-912652fefa3b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['id', 'document_id', 'passages', 'entities', 'coreferences', 'relations', 'events', 'corpus_id', 'lang'],\n", + " num_rows: 2\n", + "})" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "286a07dc-e5a3-42af-89f7-b48e035f0d25", + "metadata": {}, + "outputs": [], + "source": [ + "conf = load_config('../examples/conf/snomed_german.yaml')" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "ec6cd089-3a63-45e8-9178-86999e7f0441", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
[01/05/24 13:54:14] INFO     Loading hierarchical faiss index                                sap_bert_linker.py:153\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2;36m[01/05/24 13:54:14]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading hierarchical faiss index \u001b]8;id=183098;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py\u001b\\\u001b[2msap_bert_linker.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=912029;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py#153\u001b\\\u001b[2m153\u001b[0m\u001b]8;;\u001b\\\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
                    INFO     Loading index from                                                 faiss_indexer.py:64\n",
+       "                             /home/Florian.Borchert/.cache/xmen/snomed_german/index/sapbert/emb                    \n",
+       "                             ed_faiss_hier.pickle                                                                  \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading index from \u001b]8;id=599606;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\u001b\\\u001b[2mfaiss_indexer.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=458220;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#64\u001b\\\u001b[2m64\u001b[0m\u001b]8;;\u001b\\\n", + "\u001b[2;36m \u001b[0m \u001b[35m/home/Florian.Borchert/.cache/xmen/snomed_german/index/sapbert/\u001b[0m\u001b[95memb\u001b[0m \u001b[2m \u001b[0m\n", + "\u001b[2;36m \u001b[0m \u001b[95med_faiss_hier.pickle\u001b[0m \u001b[2m \u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[01/05/24 13:54:18] INFO     Loaded index of type <class 'faiss.swigfaiss.IndexHNSWFlat'> and   faiss_indexer.py:66\n",
+       "                             size 1967771                                                                          \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2;36m[01/05/24 13:54:18]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loaded index of type \u001b[1m<\u001b[0m\u001b[1;95mclass\u001b[0m\u001b[39m \u001b[0m\u001b[32m'faiss.swigfaiss.IndexHNSWFlat'\u001b[0m\u001b[1m>\u001b[0m and \u001b]8;id=776198;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\u001b\\\u001b[2mfaiss_indexer.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=995347;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#66\u001b\\\u001b[2m66\u001b[0m\u001b]8;;\u001b\\\n", + "\u001b[2;36m \u001b[0m size \u001b[1;36m1967771\u001b[0m \u001b[2m \u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b477d6652c9e4326b9d9ab5f651d24a8", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Map: 0%| | 0/2 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mentionclasscuicanonical namelinked byscore
0CetuximabClinical_Drug409401002Product containing cetuximab (medicinal product)[ngram, sapbert]1.000000
1monoklonaler AntikörperClinical_Drug49616005Monoclonal antibody[ngram, sapbert]0.982318
2epidermalen WachstumsfaktorrezeptorNutrient_or_Body_Substance86960007Epidermal growth factor-urogastrone receptor[sapbert]0.937485
3EGFRNutrient_or_Body_Substance86960007Epidermal growth factor-urogastrone receptor[ngram, sapbert]1.000000
4Therapie des fortgeschrittenen kolorektalen Ka...Therapeutic1217692004Metastasis from malignant neoplasm of colon an...[ngram, sapbert]0.696480
5fortgeschrittenen kolorektalen KarzinomsDiagnosis_or_Pathology1217692004Metastasis from malignant neoplasm of colon an...[ngram, sapbert]0.843411
6IrinotecanClinical_Drug372538008Irinotecan[ngram, sapbert]1.000000
7FOLFOXTherapeutic699297004Ohdo syndrome, Maat-Kievit-Brunner type[sapbert]0.812118
8Versagen einer BehandlungOther_Finding7058009Noncompliance with treatment[ngram, sapbert]0.881669
9Behandlung mit Oxaliplatin und IrinotecanTherapeutic447053005Oxaliplatin desensitization therapy[ngram, sapbert]0.673975
10OxaliplatinClinical_Drug395814003Oxaliplatin[ngram, sapbert]1.000000
11IrinotecanClinical_Drug372538008Irinotecan[ngram, sapbert]1.000000
12HPV-DiagnostikDiagnostic700152009Human papilloma virus screening[sapbert]0.913146
13Plattenepithelkarzinom der MundhöhleDiagnosis_or_Pathology307502000Squamous cell carcinoma of mouth[ngram, sapbert]0.987126
\n", + "" + ], + "text/plain": [ + " mention \\\n", + "0 Cetuximab \n", + "1 monoklonaler Antikörper \n", + "2 epidermalen Wachstumsfaktorrezeptor \n", + "3 EGFR \n", + "4 Therapie des fortgeschrittenen kolorektalen Ka... \n", + "5 fortgeschrittenen kolorektalen Karzinoms \n", + "6 Irinotecan \n", + "7 FOLFOX \n", + "8 Versagen einer Behandlung \n", + "9 Behandlung mit Oxaliplatin und Irinotecan \n", + "10 Oxaliplatin \n", + "11 Irinotecan \n", + "12 HPV-Diagnostik \n", + "13 Plattenepithelkarzinom der Mundhöhle \n", + "\n", + " class cui \\\n", + "0 Clinical_Drug 409401002 \n", + "1 Clinical_Drug 49616005 \n", + "2 Nutrient_or_Body_Substance 86960007 \n", + "3 Nutrient_or_Body_Substance 86960007 \n", + "4 Therapeutic 1217692004 \n", + "5 Diagnosis_or_Pathology 1217692004 \n", + "6 Clinical_Drug 372538008 \n", + "7 Therapeutic 699297004 \n", + "8 Other_Finding 7058009 \n", + "9 Therapeutic 447053005 \n", + "10 Clinical_Drug 395814003 \n", + "11 Clinical_Drug 372538008 \n", + "12 Diagnostic 700152009 \n", + "13 Diagnosis_or_Pathology 307502000 \n", + "\n", + " canonical name linked by \\\n", + "0 Product containing cetuximab (medicinal product) [ngram, sapbert] \n", + "1 Monoclonal antibody [ngram, sapbert] \n", + "2 Epidermal growth factor-urogastrone receptor [sapbert] \n", + "3 Epidermal growth factor-urogastrone receptor [ngram, sapbert] \n", + "4 Metastasis from malignant neoplasm of colon an... [ngram, sapbert] \n", + "5 Metastasis from malignant neoplasm of colon an... [ngram, sapbert] \n", + "6 Irinotecan [ngram, sapbert] \n", + "7 Ohdo syndrome, Maat-Kievit-Brunner type [sapbert] \n", + "8 Noncompliance with treatment [ngram, sapbert] \n", + "9 Oxaliplatin desensitization therapy [ngram, sapbert] \n", + "10 Oxaliplatin [ngram, sapbert] \n", + "11 Irinotecan [ngram, sapbert] \n", + "12 Human papilloma virus screening [sapbert] \n", + "13 Squamous cell carcinoma of mouth [ngram, sapbert] \n", + "\n", + " score \n", + "0 1.000000 \n", + "1 0.982318 \n", + "2 0.937485 \n", + "3 1.000000 \n", + "4 0.696480 \n", + "5 0.843411 \n", + "6 1.000000 \n", + "7 0.812118 \n", + "8 0.881669 \n", + "9 0.673975 \n", + "10 1.000000 \n", + "11 1.000000 \n", + "12 0.913146 \n", + "13 0.987126 " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from util import get_dataframe\n", + "get_dataframe(candidates, kb)" + ] + }, + { + "cell_type": "markdown", + "id": "5b8847b5-ff8f-40de-8a18-f2c2485daf7e", + "metadata": {}, + "source": [ + "## Re-Ranking" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "8185c65f-5645-4c38-a11a-a32a4f24a2a1", + "metadata": {}, + "outputs": [], + "source": [ + "from xmen.reranking import CrossEncoderReranker" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "07d41590-f991-474a-9109-9b75d3184266", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Context length: 128\n", + "Use NIL values: True\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f48fee7d065841388efc407f3aeaeac5", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Map: 0%| | 0/2 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mentionclasscuicanonical namelinked byscore
0CetuximabClinical_Drug409400001Cetuximab[ngram, sapbert]0.040032
1monoklonaler AntikörperClinical_Drug49616005Monoclonal antibody[ngram, sapbert]0.043237
2epidermalen WachstumsfaktorrezeptorNutrient_or_Body_Substance86960007Epidermal growth factor-urogastrone receptor[sapbert]0.022354
3EGFRNutrient_or_Body_Substance86960007Epidermal growth factor-urogastrone receptor[ngram, sapbert]0.033175
4Therapie des fortgeschrittenen kolorektalen Ka...Therapeutic1217692004Metastasis from malignant neoplasm of colon an...[ngram, sapbert]0.017036
5fortgeschrittenen kolorektalen KarzinomsDiagnosis_or_Pathology126837005Tumor of large intestine[ngram, sapbert]0.017528
6IrinotecanClinical_Drug372538008Irinotecan[ngram, sapbert]0.042663
7FOLFOXTherapeutic461391000124102Folfox protocol[ngram, sapbert]0.018860
8Versagen einer BehandlungOther_Finding266721009Absent response to treatment[sapbert]0.017751
9Behandlung mit Oxaliplatin und IrinotecanTherapeutic447053005Oxaliplatin desensitization therapy[ngram, sapbert]0.017154
10OxaliplatinClinical_Drug395814003Oxaliplatin[ngram, sapbert]0.039662
11IrinotecanClinical_Drug372538008Irinotecan[ngram, sapbert]0.045797
12HPV-DiagnostikDiagnostic700152009Human papilloma virus screening[sapbert]0.019670
13Plattenepithelkarzinom der MundhöhleDiagnosis_or_Pathology307502000Squamous cell carcinoma of mouth[ngram, sapbert]0.037824
\n", + "" + ], + "text/plain": [ + " mention \\\n", + "0 Cetuximab \n", + "1 monoklonaler Antikörper \n", + "2 epidermalen Wachstumsfaktorrezeptor \n", + "3 EGFR \n", + "4 Therapie des fortgeschrittenen kolorektalen Ka... \n", + "5 fortgeschrittenen kolorektalen Karzinoms \n", + "6 Irinotecan \n", + "7 FOLFOX \n", + "8 Versagen einer Behandlung \n", + "9 Behandlung mit Oxaliplatin und Irinotecan \n", + "10 Oxaliplatin \n", + "11 Irinotecan \n", + "12 HPV-Diagnostik \n", + "13 Plattenepithelkarzinom der Mundhöhle \n", + "\n", + " class cui \\\n", + "0 Clinical_Drug 409400001 \n", + "1 Clinical_Drug 49616005 \n", + "2 Nutrient_or_Body_Substance 86960007 \n", + "3 Nutrient_or_Body_Substance 86960007 \n", + "4 Therapeutic 1217692004 \n", + "5 Diagnosis_or_Pathology 126837005 \n", + "6 Clinical_Drug 372538008 \n", + "7 Therapeutic 461391000124102 \n", + "8 Other_Finding 266721009 \n", + "9 Therapeutic 447053005 \n", + "10 Clinical_Drug 395814003 \n", + "11 Clinical_Drug 372538008 \n", + "12 Diagnostic 700152009 \n", + "13 Diagnosis_or_Pathology 307502000 \n", + "\n", + " canonical name linked by \\\n", + "0 Cetuximab [ngram, sapbert] \n", + "1 Monoclonal antibody [ngram, sapbert] \n", + "2 Epidermal growth factor-urogastrone receptor [sapbert] \n", + "3 Epidermal growth factor-urogastrone receptor [ngram, sapbert] \n", + "4 Metastasis from malignant neoplasm of colon an... [ngram, sapbert] \n", + "5 Tumor of large intestine [ngram, sapbert] \n", + "6 Irinotecan [ngram, sapbert] \n", + "7 Folfox protocol [ngram, sapbert] \n", + "8 Absent response to treatment [sapbert] \n", + "9 Oxaliplatin desensitization therapy [ngram, sapbert] \n", + "10 Oxaliplatin [ngram, sapbert] \n", + "11 Irinotecan [ngram, sapbert] \n", + "12 Human papilloma virus screening [sapbert] \n", + "13 Squamous cell carcinoma of mouth [ngram, sapbert] \n", + "\n", + " score \n", + "0 0.040032 \n", + "1 0.043237 \n", + "2 0.022354 \n", + "3 0.033175 \n", + "4 0.017036 \n", + "5 0.017528 \n", + "6 0.042663 \n", + "7 0.018860 \n", + "8 0.017751 \n", + "9 0.017154 \n", + "10 0.039662 \n", + "11 0.045797 \n", + "12 0.019670 \n", + "13 0.037824 " + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# After Re-ranking\n", + "get_dataframe(reranked, kb)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ace9bf0-c0ef-4c9e-9bd6-6ecce3cf03b6", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "070c2a21-fe32-4b59-a8ac-1b97d9548e76", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:xmen_notebooks]", + "language": "python", + "name": "conda-env-xmen_notebooks-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 0000000..d8f8a05 --- /dev/null +++ b/examples/README.md @@ -0,0 +1,13 @@ +# Examples + +|Link|Description| +|---|---| +|[01_BRONCO_German.ipynb](01_BRONCO_German.ipynb)|Candidate generation and supervised re-ranking using the BRONCO corpus.
Shows how you can configure multiple dictionaries in the same config file.| +|[02_GGPONC_German.ipynb](02_GGPONC_German.ipynb)|Using a spaCy NER model with xMEN
Shows how to build a pipeline without labelled data using candidate generation, type filtering and pre-trained re-rankers| +|[03_SNOMED_Linking_German.ipynb](03_SNOMED_Linking_German.ipynb)|Linking against codes in UMLS source vocabularies (here SNOMED CT)| +| | | + + +## Benchmarks + +More examples for configurations can be found in the [Benchmarks](../benchmarks) folder. \ No newline at end of file diff --git a/examples/Temp.ipynb b/examples/Temp.ipynb deleted file mode 100644 index 40b541c..0000000 --- a/examples/Temp.ipynb +++ /dev/null @@ -1,216 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from xmen import evaluation\n", - "from xmen.data import make_document, Entity, Concept" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "gt = [\n", - " make_document(\n", - " [\n", - " Entity([[11, 17]], \"entity\", concepts=[\n", - " Concept(\"c1\", db_name=\"UMLS\"),\n", - " ]),\n", - " ]\n", - " )\n", - "]\n", - "pred = [\n", - " make_document(\n", - " [\n", - " Entity([[11, 17]], \"entity\", concepts=[\n", - " Concept(\"c1\", db_name=\"UMLS\")\n", - " ]),\n", - " Entity([[11, 15]], \"entity\", concepts=[\n", - " Concept(\"c1\", db_name=\"UMLS\")\n", - " ]),\n", - " Entity([[16, 17]], \"entity\", concepts=[\n", - " Concept(\"c2\", db_name=\"UMLS\")\n", - " ]),\n", - " ]\n", - " )\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
_word_len_abbrevgt_startgt_endgt_textpred_startpred_endpred_textner_match_typegold_conceptgold_typepred_indexpred_index_scorepred_toppred_top_scorecorpus_iddocument_id
01.0False11.017.0[entity]1115[entity]be{'db_id': 'c1', 'target_kb': 'UMLS', 'type': N...None0Nonec1Nonex1
11.0False11.017.0[entity]1117[entity]be{'db_id': 'c1', 'target_kb': 'UMLS', 'type': N...None0Nonec1Nonex1
2NaNNoneNaNNaNNone1617[entity]fpNoneNone-1NoneNoneNonex1
\n", - "
" - ], - "text/plain": [ - " _word_len _abbrev gt_start gt_end gt_text pred_start pred_end \\\n", - "0 1.0 False 11.0 17.0 [entity] 11 15 \n", - "1 1.0 False 11.0 17.0 [entity] 11 17 \n", - "2 NaN None NaN NaN None 16 17 \n", - "\n", - " pred_text ner_match_type gold_concept \\\n", - "0 [entity] be {'db_id': 'c1', 'target_kb': 'UMLS', 'type': N... \n", - "1 [entity] be {'db_id': 'c1', 'target_kb': 'UMLS', 'type': N... \n", - "2 [entity] fp None \n", - "\n", - " gold_type pred_index pred_index_score pred_top pred_top_score corpus_id \\\n", - "0 None 0 None c1 None x \n", - "1 None 0 None c1 None x \n", - "2 None -1 None None None x \n", - "\n", - " document_id \n", - "0 1 \n", - "1 1 \n", - "2 1 " - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from xmen.evaluation import error_analysis\n", - "error_analysis(gt, pred)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "xmen_notebooks", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.11" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/examples/conf/meddra_german.yaml b/examples/conf/meddra_german.yaml index c20292a..552246f 100644 --- a/examples/conf/meddra_german.yaml +++ b/examples/conf/meddra_german.yaml @@ -3,6 +3,12 @@ name: meddra_german dict: custom: umls_meta_path: ${oc.env:UMLS_HOME}/2023AA/META + id_key: SDUI sabs: - MDR - - MDRGER \ No newline at end of file + - MDRGER + # Un-comment to add aliases from other UMLS vocabularies and languages + #umls_extend: + # lang: + # - de + # - en \ No newline at end of file diff --git a/examples/conf/snomed_german.yaml b/examples/conf/snomed_german.yaml new file mode 100644 index 0000000..e31d025 --- /dev/null +++ b/examples/conf/snomed_german.yaml @@ -0,0 +1,24 @@ +name: snomed_german + +cache_dir: ${oc.env:HOME}/.cache/xmen/ + +dict: + custom: + umls_meta_path: ${oc.env:UMLS_HOME}/2023AA/META + id_key: SCUI + sabs: + - SNOMEDCT_US + umls_extend: + lang: + - de + - en + +linker: + candidate_generation: + k: 64 + ngram: + index_base_path: ${cache_dir}/${name}/index/ngrams/ + k: 3 + sapbert: + index_base_path: ${cache_dir}/${name}/index/sapbert + k: 3 \ No newline at end of file diff --git a/examples/dicts/umls_source.py b/examples/dicts/umls_source.py index 3755741..a32222d 100644 --- a/examples/dicts/umls_source.py +++ b/examples/dicts/umls_source.py @@ -1,25 +1,54 @@ from tqdm.auto import tqdm -from xmen.umls import read_umls_file_headers - +from xmen.umls import read_umls_file_headers, get_umls_concepts +from collections import defaultdict +from xmen.log import logger def get_concept_details(cfg): mrconso = "MRCONSO.RRF" concept_details = {} meta_path = cfg.dict.custom.umls_meta_path sabs = cfg.dict.custom.sabs + id_key = cfg.dict.custom.get("id_key") headers = read_umls_file_headers(meta_path, mrconso) + scui2cui = defaultdict(list) + with open(f"{meta_path}/{mrconso}") as fin: for line in tqdm(fin.readlines()): splits = line.strip().split("|") assert len(headers) == len(splits) concept = dict(zip(headers, splits)) if concept["SAB"] in sabs: - sid = concept["SDUI"] + cui = concept["CUI"] + if id_key: + sid = concept[id_key] + else: + sid = concept["SDUI"] + if not sid: + sid = concept["SCUI"] + if not sid: + logger.warn(f"Skipping concept with CUI {cui} because we could not find a valid source vocabulary ID") name = concept["STR"] if sid in concept_details: concept_details[sid]["aliases"].append(name) else: concept_details[sid] = {"concept_id": sid, "canonical_name": name, "types": [], "aliases": []} + scui2cui[sid].append(cui) + + if umls_extend := cfg.dict.custom.get('umls_extend'): + # Optionally extend with UMLS synonyms + other_umls_concepts = get_umls_concepts(meta_path, + umls_extend.get("lang"), sabs=umls_extend.get("sabs"), sources=umls_extend.get("sources"), semantic_groups=umls_extend.get("semantic_groups"), semantic_types=umls_extend.get("semantic_types")) + + for scui, concept in tqdm(concept_details.items()): + for cui in scui2cui[scui]: + if cui in other_umls_concepts: + for t in other_umls_concepts[cui]['types']: + if not t in concept["types"]: + concept["types"].append(t) + for new_alias in other_umls_concepts[cui]["aliases"] + [other_umls_concepts[cui]["canonical_name"]]: + if new_alias not in concept["aliases"] and new_alias != concept["canonical_name"]: + concept["aliases"].append(new_alias) + return concept_details diff --git a/examples/ggponc2tui.csv b/examples/ggponc2tui.csv deleted file mode 100644 index 2264706..0000000 --- a/examples/ggponc2tui.csv +++ /dev/null @@ -1,128 +0,0 @@ -Group,Group Long,TUI,Type Name,Diagnosis_or_Pathology,Other_Finding,Clinical_Drug,Nutrient_or_Body_Substance,External_Substance,Therapeutic,Diagnostic -ACTI,Activities & Behaviors,T052,Activity,x,x,,,,x,x -ACTI,Activities & Behaviors,T053,Behavior,x,x,,,,, -ACTI,Activities & Behaviors,T056,Daily or Recreational Activity,x,x,,,,, -ACTI,Activities & Behaviors,T051,Event,x,x,,,,, -ACTI,Activities & Behaviors,T064,Governmental or Regulatory Activity,x,x,,,,x,x -ACTI,Activities & Behaviors,T055,Individual Behavior,x,x,,,,, -ACTI,Activities & Behaviors,T066,Machine Activity,x,x,,,,x,x -ACTI,Activities & Behaviors,T057,Occupational Activity,x,x,,,,x,x -ACTI,Activities & Behaviors,T054,Social Behavior,x,x,,,,, -ANAT,Anatomy,T017,Anatomical Structure,x,x,,x,,, -ANAT,Anatomy,T029,Body Location or Region,x,x,,x,,, -ANAT,Anatomy,T023,"Body Part, Organ, or Organ Component",x,x,,x,,, -ANAT,Anatomy,T030,Body Space or Junction,x,x,,x,,, -ANAT,Anatomy,T031,Body Substance,x,x,,x,,, -ANAT,Anatomy,T022,Body System,x,x,,x,,, -ANAT,Anatomy,T025,Cell,x,x,,x,,, -ANAT,Anatomy,T026,Cell Component,x,x,,x,,, -ANAT,Anatomy,T018,Embryonic Structure,x,x,,x,,, -ANAT,Anatomy,T021,Fully Formed Anatomical Structure,x,x,,x,,, -ANAT,Anatomy,T024,Tissue,x,x,,x,,, -CHEM,Chemicals & Drugs,T116,"Amino Acid, Peptide, or Protein",,,x,x,x,x,x -CHEM,Chemicals & Drugs,T195,Antibiotic,,,x,x,x,x,x -CHEM,Chemicals & Drugs,T123,Biologically Active Substance,x,x,x,x,x,x,x -CHEM,Chemicals & Drugs,T122,Biomedical or Dental Material,x,x,x,x,x,x,x -CHEM,Chemicals & Drugs,T103,Chemical,x,x,x,x,x,x,x -CHEM,Chemicals & Drugs,T120,Chemical Viewed Functionally,x,x,x,x,x,x,x -CHEM,Chemicals & Drugs,T104,Chemical Viewed Structurally,x,x,x,x,x,x,x -CHEM,Chemicals & Drugs,T200,Clinical Drug,x,x,x,x,x,x,x -CHEM,Chemicals & Drugs,T196,"Element, Ion, or Isotope",x,x,x,x,x,x,x -CHEM,Chemicals & Drugs,T126,Enzyme,x,x,x,x,x,x,x -CHEM,Chemicals & Drugs,T131,Hazardous or Poisonous Substance,x,x,x,x,x,x,x -CHEM,Chemicals & Drugs,T125,Hormone,x,x,x,x,x,x,x -CHEM,Chemicals & Drugs,T129,Immunologic Factor,x,x,x,x,x,x,x -CHEM,Chemicals & Drugs,T130,"Indicator, Reagent, or Diagnostic Aid",x,x,x,x,x,x,x -CHEM,Chemicals & Drugs,T197,Inorganic Chemical,x,x,x,x,x,x,x -CHEM,Chemicals & Drugs,T114,"Nucleic Acid, Nucleoside, or Nucleotide",x,x,x,x,x,x,x -CHEM,Chemicals & Drugs,T109,Organic Chemical,x,x,x,x,x,x,x -CHEM,Chemicals & Drugs,T121,Pharmacologic Substance,x,x,x,x,x,x,x -CHEM,Chemicals & Drugs,T192,Receptor,x,x,x,x,x,x,x -CHEM,Chemicals & Drugs,T127,Vitamin,x,x,x,x,x,x,x -CONC,Concepts & Ideas,T185,Classification,,x,,,,, -CONC,Concepts & Ideas,T077,Conceptual Entity,,x,,,,, -CONC,Concepts & Ideas,T169,Functional Concept,,x,,,,, -CONC,Concepts & Ideas,T102,Group Attribute,,x,,,,, -CONC,Concepts & Ideas,T078,Idea or Concept,,x,,,,, -CONC,Concepts & Ideas,T170,Intellectual Product,,,,,,, -CONC,Concepts & Ideas,T171,Language,,x,,,,, -CONC,Concepts & Ideas,T080,Qualitative Concept,,x,,,,, -CONC,Concepts & Ideas,T081,Quantitative Concept,,x,,,,, -CONC,Concepts & Ideas,T089,Regulation or Law,,,,,,, -CONC,Concepts & Ideas,T082,Spatial Concept,,x,,,,, -CONC,Concepts & Ideas,T079,Temporal Concept,,x,,,,, -DEVI,Devices,T203,Drug Delivery Device,,,x,,,x,x -DEVI,Devices,T074,Medical Device,,,x,,,x,x -DEVI,Devices,T075,Research Device,,,x,,,x,x -DISO,Disorders,T020,Acquired Abnormality,x,x,,,,, -DISO,Disorders,T190,Anatomical Abnormality,x,x,,,,, -DISO,Disorders,T049,Cell or Molecular Dysfunction,x,x,,,,, -DISO,Disorders,T019,Congenital Abnormality,x,x,,,,, -DISO,Disorders,T047,Disease or Syndrome,x,x,,,,, -DISO,Disorders,T050,Experimental Model of Disease,x,x,,,,, -DISO,Disorders,T033,Finding,x,x,,,,, -DISO,Disorders,T037,Injury or Poisoning,x,x,,,x,, -DISO,Disorders,T048,Mental or Behavioral Dysfunction,x,x,,,,, -DISO,Disorders,T191,Neoplastic Process,x,x,,,,, -DISO,Disorders,T046,Pathologic Function,x,x,,,,, -DISO,Disorders,T184,Sign or Symptom,x,x,,,,, -GENE,Genes & Molecular Sequences,T087,Amino Acid Sequence,x,x,x,x,x,, -GENE,Genes & Molecular Sequences,T088,Carbohydrate Sequence,x,x,x,x,x,, -GENE,Genes & Molecular Sequences,T028,Gene or Genome,x,x,x,x,x,, -GENE,Genes & Molecular Sequences,T085,Molecular Sequence,x,x,x,x,x,, -GENE,Genes & Molecular Sequences,T086,Nucleotide Sequence,x,x,x,x,x,, -GEOG,Geographic Areas,T083,Geographic Area,x,x,x,x,x,, -LIVB,Living Beings,T100,Age Group,,x,,,,, -LIVB,Living Beings,T011,Amphibian,,x,,,,, -LIVB,Living Beings,T008,Animal,,x,,,,, -LIVB,Living Beings,T194,Archaeon,,x,,,,, -LIVB,Living Beings,T007,Bacterium,x,x,,x,,, -LIVB,Living Beings,T012,Bird,,x,,,,, -LIVB,Living Beings,T204,Eukaryote,x,x,,x,,, -LIVB,Living Beings,T099,Family Group,,x,,,,, -LIVB,Living Beings,T013,Fish,,x,,,,, -LIVB,Living Beings,T004,Fungus,x,x,,x,,, -LIVB,Living Beings,T096,Group,,x,,,,, -LIVB,Living Beings,T016,Human,,x,,,,, -LIVB,Living Beings,T015,Mammal,,x,,,,, -LIVB,Living Beings,T001,Organism,,x,,,,, -LIVB,Living Beings,T101,Patient or Disabled Group,x,x,,,,, -LIVB,Living Beings,T002,Plant,,x,,,,, -LIVB,Living Beings,T098,Population Group,,x,,,,, -LIVB,Living Beings,T097,Professional or Occupational Group,,x,,,,x,x -LIVB,Living Beings,T014,Reptile,,x,,,,, -LIVB,Living Beings,T010,Vertebrate,,x,,,,, -LIVB,Living Beings,T005,Virus,x,x,,,,, -OBJC,Objects,T071,Entity,x,x,x,x,x,x,x -OBJC,Objects,T168,Food,x,x,x,x,x,x,x -OBJC,Objects,T073,Manufactured Object,x,x,x,x,x,x,x -OBJC,Objects,T072,Physical Object,x,x,x,x,x,x,x -OBJC,Objects,T167,Substance,x,x,x,x,x,x,x -OCCU,Occupations,T091,Biomedical Occupation or Discipline,x,x,,,,x,x -OCCU,Occupations,T090,Occupation or Discipline,x,x,,,,x,x -ORGA,Organizations,T093,Health Care Related Organization,,x,,,,x,x -ORGA,Organizations,T092,Organization,,x,,,,x,x -ORGA,Organizations,T094,Professional Society,,x,,,,x,x -ORGA,Organizations,T095,Self-help or Relief Organization,,x,,,,x,x -PHEN,Phenomena,T038,Biologic Function,x,x,,,,x,x -PHEN,Phenomena,T069,Environmental Effect of Humans,x,x,,,,x,x -PHEN,Phenomena,T068,Human-caused Phenomenon or Process,x,x,,,,x,x -PHEN,Phenomena,T034,Laboratory or Test Result,x,x,,,,x,x -PHEN,Phenomena,T070,Natural Phenomenon or Process,x,x,,,,x,x -PHEN,Phenomena,T067,Phenomenon or Process,x,x,,,,x,x -PHYS,Physiology,T043,Cell Function,x,x,,x,,, -PHYS,Physiology,T201,Clinical Attribute,x,x,,,,, -PHYS,Physiology,T045,Genetic Function,x,x,,x,,, -PHYS,Physiology,T041,Mental Process,x,x,,,,, -PHYS,Physiology,T044,Molecular Function,x,x,,x,,, -PHYS,Physiology,T032,Organism Attribute,x,x,,,,, -PHYS,Physiology,T040,Organism Function,x,x,,,,, -PHYS,Physiology,T042,Organ or Tissue Function,x,x,,,,, -PHYS,Physiology,T039,Physiologic Function,x,x,,,,, -PROC,Procedures,T060,Diagnostic Procedure,,,,,,x,x -PROC,Procedures,T065,Educational Activity,,,,,,x, -PROC,Procedures,T058,Health Care Activity,,,,,,x,x -PROC,Procedures,T059,Laboratory Procedure,,,,,,x,x -PROC,Procedures,T063,Molecular Biology Research Technique,,,,,,x,x -PROC,Procedures,T062,Research Activity,,,,,,x,x -PROC,Procedures,T061,Therapeutic or Preventive Procedure,,,,,,x,x \ No newline at end of file diff --git a/examples/ggponc_tuis.csv b/examples/ggponc_tuis.csv new file mode 100644 index 0000000..2ee4ab5 --- /dev/null +++ b/examples/ggponc_tuis.csv @@ -0,0 +1,414 @@ +class,tui +Diagnosis_or_Pathology,T052 +Diagnosis_or_Pathology,T053 +Diagnosis_or_Pathology,T056 +Diagnosis_or_Pathology,T051 +Diagnosis_or_Pathology,T064 +Diagnosis_or_Pathology,T055 +Diagnosis_or_Pathology,T066 +Diagnosis_or_Pathology,T057 +Diagnosis_or_Pathology,T054 +Diagnosis_or_Pathology,T017 +Diagnosis_or_Pathology,T029 +Diagnosis_or_Pathology,T023 +Diagnosis_or_Pathology,T030 +Diagnosis_or_Pathology,T031 +Diagnosis_or_Pathology,T022 +Diagnosis_or_Pathology,T025 +Diagnosis_or_Pathology,T026 +Diagnosis_or_Pathology,T018 +Diagnosis_or_Pathology,T021 +Diagnosis_or_Pathology,T024 +Diagnosis_or_Pathology,T123 +Diagnosis_or_Pathology,T122 +Diagnosis_or_Pathology,T103 +Diagnosis_or_Pathology,T120 +Diagnosis_or_Pathology,T104 +Diagnosis_or_Pathology,T200 +Diagnosis_or_Pathology,T196 +Diagnosis_or_Pathology,T126 +Diagnosis_or_Pathology,T131 +Diagnosis_or_Pathology,T125 +Diagnosis_or_Pathology,T129 +Diagnosis_or_Pathology,T130 +Diagnosis_or_Pathology,T197 +Diagnosis_or_Pathology,T114 +Diagnosis_or_Pathology,T109 +Diagnosis_or_Pathology,T121 +Diagnosis_or_Pathology,T192 +Diagnosis_or_Pathology,T127 +Diagnosis_or_Pathology,T020 +Diagnosis_or_Pathology,T190 +Diagnosis_or_Pathology,T049 +Diagnosis_or_Pathology,T019 +Diagnosis_or_Pathology,T047 +Diagnosis_or_Pathology,T050 +Diagnosis_or_Pathology,T033 +Diagnosis_or_Pathology,T037 +Diagnosis_or_Pathology,T048 +Diagnosis_or_Pathology,T191 +Diagnosis_or_Pathology,T046 +Diagnosis_or_Pathology,T184 +Diagnosis_or_Pathology,T087 +Diagnosis_or_Pathology,T088 +Diagnosis_or_Pathology,T028 +Diagnosis_or_Pathology,T085 +Diagnosis_or_Pathology,T086 +Diagnosis_or_Pathology,T083 +Diagnosis_or_Pathology,T007 +Diagnosis_or_Pathology,T204 +Diagnosis_or_Pathology,T004 +Diagnosis_or_Pathology,T101 +Diagnosis_or_Pathology,T005 +Diagnosis_or_Pathology,T071 +Diagnosis_or_Pathology,T168 +Diagnosis_or_Pathology,T073 +Diagnosis_or_Pathology,T072 +Diagnosis_or_Pathology,T167 +Diagnosis_or_Pathology,T091 +Diagnosis_or_Pathology,T090 +Diagnosis_or_Pathology,T038 +Diagnosis_or_Pathology,T069 +Diagnosis_or_Pathology,T068 +Diagnosis_or_Pathology,T034 +Diagnosis_or_Pathology,T070 +Diagnosis_or_Pathology,T067 +Diagnosis_or_Pathology,T043 +Diagnosis_or_Pathology,T201 +Diagnosis_or_Pathology,T045 +Diagnosis_or_Pathology,T041 +Diagnosis_or_Pathology,T044 +Diagnosis_or_Pathology,T032 +Diagnosis_or_Pathology,T040 +Diagnosis_or_Pathology,T042 +Diagnosis_or_Pathology,T039 +Other_Finding,T052 +Other_Finding,T053 +Other_Finding,T056 +Other_Finding,T051 +Other_Finding,T064 +Other_Finding,T055 +Other_Finding,T066 +Other_Finding,T057 +Other_Finding,T054 +Other_Finding,T017 +Other_Finding,T029 +Other_Finding,T023 +Other_Finding,T030 +Other_Finding,T031 +Other_Finding,T022 +Other_Finding,T025 +Other_Finding,T026 +Other_Finding,T018 +Other_Finding,T021 +Other_Finding,T024 +Other_Finding,T123 +Other_Finding,T122 +Other_Finding,T103 +Other_Finding,T120 +Other_Finding,T104 +Other_Finding,T200 +Other_Finding,T196 +Other_Finding,T126 +Other_Finding,T131 +Other_Finding,T125 +Other_Finding,T129 +Other_Finding,T130 +Other_Finding,T197 +Other_Finding,T114 +Other_Finding,T109 +Other_Finding,T121 +Other_Finding,T192 +Other_Finding,T127 +Other_Finding,T185 +Other_Finding,T077 +Other_Finding,T169 +Other_Finding,T102 +Other_Finding,T078 +Other_Finding,T171 +Other_Finding,T080 +Other_Finding,T081 +Other_Finding,T082 +Other_Finding,T079 +Other_Finding,T020 +Other_Finding,T190 +Other_Finding,T049 +Other_Finding,T019 +Other_Finding,T047 +Other_Finding,T050 +Other_Finding,T033 +Other_Finding,T037 +Other_Finding,T048 +Other_Finding,T191 +Other_Finding,T046 +Other_Finding,T184 +Other_Finding,T087 +Other_Finding,T088 +Other_Finding,T028 +Other_Finding,T085 +Other_Finding,T086 +Other_Finding,T083 +Other_Finding,T100 +Other_Finding,T011 +Other_Finding,T008 +Other_Finding,T194 +Other_Finding,T007 +Other_Finding,T012 +Other_Finding,T204 +Other_Finding,T099 +Other_Finding,T013 +Other_Finding,T004 +Other_Finding,T096 +Other_Finding,T016 +Other_Finding,T015 +Other_Finding,T001 +Other_Finding,T101 +Other_Finding,T002 +Other_Finding,T098 +Other_Finding,T097 +Other_Finding,T014 +Other_Finding,T010 +Other_Finding,T005 +Other_Finding,T071 +Other_Finding,T168 +Other_Finding,T073 +Other_Finding,T072 +Other_Finding,T167 +Other_Finding,T091 +Other_Finding,T090 +Other_Finding,T093 +Other_Finding,T092 +Other_Finding,T094 +Other_Finding,T095 +Other_Finding,T038 +Other_Finding,T069 +Other_Finding,T068 +Other_Finding,T034 +Other_Finding,T070 +Other_Finding,T067 +Other_Finding,T043 +Other_Finding,T201 +Other_Finding,T045 +Other_Finding,T041 +Other_Finding,T044 +Other_Finding,T032 +Other_Finding,T040 +Other_Finding,T042 +Other_Finding,T039 +Clinical_Drug,T116 +Clinical_Drug,T195 +Clinical_Drug,T123 +Clinical_Drug,T122 +Clinical_Drug,T103 +Clinical_Drug,T120 +Clinical_Drug,T104 +Clinical_Drug,T200 +Clinical_Drug,T196 +Clinical_Drug,T126 +Clinical_Drug,T131 +Clinical_Drug,T125 +Clinical_Drug,T129 +Clinical_Drug,T130 +Clinical_Drug,T197 +Clinical_Drug,T114 +Clinical_Drug,T109 +Clinical_Drug,T121 +Clinical_Drug,T192 +Clinical_Drug,T127 +Clinical_Drug,T203 +Clinical_Drug,T074 +Clinical_Drug,T075 +Clinical_Drug,T087 +Clinical_Drug,T088 +Clinical_Drug,T028 +Clinical_Drug,T085 +Clinical_Drug,T086 +Clinical_Drug,T083 +Clinical_Drug,T071 +Clinical_Drug,T168 +Clinical_Drug,T073 +Clinical_Drug,T072 +Clinical_Drug,T167 +Nutrient_or_Body_Substance,T017 +Nutrient_or_Body_Substance,T029 +Nutrient_or_Body_Substance,T023 +Nutrient_or_Body_Substance,T030 +Nutrient_or_Body_Substance,T031 +Nutrient_or_Body_Substance,T022 +Nutrient_or_Body_Substance,T025 +Nutrient_or_Body_Substance,T026 +Nutrient_or_Body_Substance,T018 +Nutrient_or_Body_Substance,T021 +Nutrient_or_Body_Substance,T024 +Nutrient_or_Body_Substance,T116 +Nutrient_or_Body_Substance,T195 +Nutrient_or_Body_Substance,T123 +Nutrient_or_Body_Substance,T122 +Nutrient_or_Body_Substance,T103 +Nutrient_or_Body_Substance,T120 +Nutrient_or_Body_Substance,T104 +Nutrient_or_Body_Substance,T200 +Nutrient_or_Body_Substance,T196 +Nutrient_or_Body_Substance,T126 +Nutrient_or_Body_Substance,T131 +Nutrient_or_Body_Substance,T125 +Nutrient_or_Body_Substance,T129 +Nutrient_or_Body_Substance,T130 +Nutrient_or_Body_Substance,T197 +Nutrient_or_Body_Substance,T114 +Nutrient_or_Body_Substance,T109 +Nutrient_or_Body_Substance,T121 +Nutrient_or_Body_Substance,T192 +Nutrient_or_Body_Substance,T127 +Nutrient_or_Body_Substance,T087 +Nutrient_or_Body_Substance,T088 +Nutrient_or_Body_Substance,T028 +Nutrient_or_Body_Substance,T085 +Nutrient_or_Body_Substance,T086 +Nutrient_or_Body_Substance,T083 +Nutrient_or_Body_Substance,T007 +Nutrient_or_Body_Substance,T204 +Nutrient_or_Body_Substance,T004 +Nutrient_or_Body_Substance,T071 +Nutrient_or_Body_Substance,T168 +Nutrient_or_Body_Substance,T073 +Nutrient_or_Body_Substance,T072 +Nutrient_or_Body_Substance,T167 +Nutrient_or_Body_Substance,T043 +Nutrient_or_Body_Substance,T045 +Nutrient_or_Body_Substance,T044 +External_Substance,T116 +External_Substance,T195 +External_Substance,T123 +External_Substance,T122 +External_Substance,T103 +External_Substance,T120 +External_Substance,T104 +External_Substance,T200 +External_Substance,T196 +External_Substance,T126 +External_Substance,T131 +External_Substance,T125 +External_Substance,T129 +External_Substance,T130 +External_Substance,T197 +External_Substance,T114 +External_Substance,T109 +External_Substance,T121 +External_Substance,T192 +External_Substance,T127 +External_Substance,T037 +External_Substance,T087 +External_Substance,T088 +External_Substance,T028 +External_Substance,T085 +External_Substance,T086 +External_Substance,T083 +External_Substance,T071 +External_Substance,T168 +External_Substance,T073 +External_Substance,T072 +External_Substance,T167 +Therapeutic,T052 +Therapeutic,T064 +Therapeutic,T066 +Therapeutic,T057 +Therapeutic,T116 +Therapeutic,T195 +Therapeutic,T123 +Therapeutic,T122 +Therapeutic,T103 +Therapeutic,T120 +Therapeutic,T104 +Therapeutic,T200 +Therapeutic,T196 +Therapeutic,T126 +Therapeutic,T131 +Therapeutic,T125 +Therapeutic,T129 +Therapeutic,T130 +Therapeutic,T197 +Therapeutic,T114 +Therapeutic,T109 +Therapeutic,T121 +Therapeutic,T192 +Therapeutic,T127 +Therapeutic,T203 +Therapeutic,T074 +Therapeutic,T075 +Therapeutic,T097 +Therapeutic,T071 +Therapeutic,T168 +Therapeutic,T073 +Therapeutic,T072 +Therapeutic,T167 +Therapeutic,T091 +Therapeutic,T090 +Therapeutic,T093 +Therapeutic,T092 +Therapeutic,T094 +Therapeutic,T095 +Therapeutic,T038 +Therapeutic,T069 +Therapeutic,T068 +Therapeutic,T034 +Therapeutic,T070 +Therapeutic,T067 +Therapeutic,T060 +Therapeutic,T065 +Therapeutic,T058 +Therapeutic,T059 +Therapeutic,T063 +Therapeutic,T062 +Therapeutic,T061 +Diagnostic,T052 +Diagnostic,T064 +Diagnostic,T066 +Diagnostic,T057 +Diagnostic,T116 +Diagnostic,T195 +Diagnostic,T123 +Diagnostic,T122 +Diagnostic,T103 +Diagnostic,T120 +Diagnostic,T104 +Diagnostic,T200 +Diagnostic,T196 +Diagnostic,T126 +Diagnostic,T131 +Diagnostic,T125 +Diagnostic,T129 +Diagnostic,T130 +Diagnostic,T197 +Diagnostic,T114 +Diagnostic,T109 +Diagnostic,T121 +Diagnostic,T192 +Diagnostic,T127 +Diagnostic,T203 +Diagnostic,T074 +Diagnostic,T075 +Diagnostic,T097 +Diagnostic,T071 +Diagnostic,T168 +Diagnostic,T073 +Diagnostic,T072 +Diagnostic,T167 +Diagnostic,T091 +Diagnostic,T090 +Diagnostic,T093 +Diagnostic,T092 +Diagnostic,T094 +Diagnostic,T095 +Diagnostic,T038 +Diagnostic,T069 +Diagnostic,T068 +Diagnostic,T034 +Diagnostic,T070 +Diagnostic,T067 +Diagnostic,T060 +Diagnostic,T058 +Diagnostic,T059 +Diagnostic,T063 +Diagnostic,T062 +Diagnostic,T061 diff --git a/examples/conf/distemist.yaml b/examples/old_examples/conf/distemist.yaml similarity index 100% rename from examples/conf/distemist.yaml rename to examples/old_examples/conf/distemist.yaml diff --git a/examples/01_BioASQ_DisTEMIST.ipynb b/examples/old_examples/distemist/01_BioASQ_DisTEMIST.ipynb similarity index 100% rename from examples/01_BioASQ_DisTEMIST.ipynb rename to examples/old_examples/distemist/01_BioASQ_DisTEMIST.ipynb diff --git a/examples/dicts/distemist.py b/examples/old_examples/distemist/distemist.py similarity index 100% rename from examples/dicts/distemist.py rename to examples/old_examples/distemist/distemist.py diff --git a/examples/distemist_bioasq.yaml b/examples/old_examples/distemist/distemist_bioasq.yaml similarity index 100% rename from examples/distemist_bioasq.yaml rename to examples/old_examples/distemist/distemist_bioasq.yaml diff --git a/examples/notebook_util.py b/examples/old_examples/distemist/notebook_util.py similarity index 100% rename from examples/notebook_util.py rename to examples/old_examples/distemist/notebook_util.py diff --git a/examples/util.py b/examples/util.py new file mode 100644 index 0000000..a62927f --- /dev/null +++ b/examples/util.py @@ -0,0 +1,15 @@ +import pandas as pd + +def get_dataframe(predictions, kb): + ents = [] + for d in predictions: + for e in d['entities']: + span = ' '.join(e['text']) + label = e['type'] + top_concept = e['normalized'][0] if len(e['normalized']) > 0 else None + if top_concept: + cui = top_concept['db_id'] + ents.append({'mention' : span, 'class' : label, 'cui' : cui, 'canonical name' : kb.cui_to_entity[cui].canonical_name, 'linked by' : top_concept['predicted_by'], 'score' : top_concept['score']}) + else: + ents.append({'mention' : span, 'class' : label, 'cui' : 'Not linkable'}) + return pd.DataFrame(ents) \ No newline at end of file From b74a252667a7313f53cb11595d870e415d624ee0 Mon Sep 17 00:00:00 2001 From: Florian Borchert Date: Fri, 5 Jan 2024 14:09:04 +0100 Subject: [PATCH 2/7] Readme --- README.md | 4 +++ examples/03_SNOMED_Linking_German.ipynb | 36 ++++++++++--------------- examples/README.md | 23 ++++++++++++---- 3 files changed, 36 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index 4653b30..e1dda99 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,10 @@ We use [Poetry](https://python-poetry.org/) for building, testing and dependency A very simple pipeline highlighting the main components of xMEN can be found in [notebooks/00_Getting_Started.ipynb](notebooks/00_Getting_Started.ipynb) +## 🎓 Examples + +For more advanced use cases, check out the [examples](examples) folder. + ## 📂 Data Loading Usually, BigBIO-compatible datasets can just be loaded from the Hugging Face Hub: diff --git a/examples/03_SNOMED_Linking_German.ipynb b/examples/03_SNOMED_Linking_German.ipynb index 3f43598..cc723c2 100644 --- a/examples/03_SNOMED_Linking_German.ipynb +++ b/examples/03_SNOMED_Linking_German.ipynb @@ -345,11 +345,11 @@ { "data": { "text/html": [ - "
[01/05/24 13:54:14] INFO     Loading hierarchical faiss index                                sap_bert_linker.py:153\n",
+       "
[01/05/24 14:06:56] INFO     Loading hierarchical faiss index                                sap_bert_linker.py:153\n",
        "
\n" ], "text/plain": [ - "\u001b[2;36m[01/05/24 13:54:14]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading hierarchical faiss index \u001b]8;id=183098;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py\u001b\\\u001b[2msap_bert_linker.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=912029;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py#153\u001b\\\u001b[2m153\u001b[0m\u001b]8;;\u001b\\\n" + "\u001b[2;36m[01/05/24 14:06:56]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading hierarchical faiss index \u001b]8;id=377106;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py\u001b\\\u001b[2msap_bert_linker.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=654858;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py#153\u001b\\\u001b[2m153\u001b[0m\u001b]8;;\u001b\\\n" ] }, "metadata": {}, @@ -364,7 +364,7 @@ "
\n" ], "text/plain": [ - "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading index from \u001b]8;id=599606;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\u001b\\\u001b[2mfaiss_indexer.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=458220;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#64\u001b\\\u001b[2m64\u001b[0m\u001b]8;;\u001b\\\n", + "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loading index from \u001b]8;id=487488;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\u001b\\\u001b[2mfaiss_indexer.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=593198;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#64\u001b\\\u001b[2m64\u001b[0m\u001b]8;;\u001b\\\n", "\u001b[2;36m \u001b[0m \u001b[35m/home/Florian.Borchert/.cache/xmen/snomed_german/index/sapbert/\u001b[0m\u001b[95memb\u001b[0m \u001b[2m \u001b[0m\n", "\u001b[2;36m \u001b[0m \u001b[95med_faiss_hier.pickle\u001b[0m \u001b[2m \u001b[0m\n" ] @@ -375,12 +375,12 @@ { "data": { "text/html": [ - "
[01/05/24 13:54:18] INFO     Loaded index of type <class 'faiss.swigfaiss.IndexHNSWFlat'> and   faiss_indexer.py:66\n",
+       "
[01/05/24 14:06:59] INFO     Loaded index of type <class 'faiss.swigfaiss.IndexHNSWFlat'> and   faiss_indexer.py:66\n",
        "                             size 1967771                                                                          \n",
        "
\n" ], "text/plain": [ - "\u001b[2;36m[01/05/24 13:54:18]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loaded index of type \u001b[1m<\u001b[0m\u001b[1;95mclass\u001b[0m\u001b[39m \u001b[0m\u001b[32m'faiss.swigfaiss.IndexHNSWFlat'\u001b[0m\u001b[1m>\u001b[0m and \u001b]8;id=776198;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\u001b\\\u001b[2mfaiss_indexer.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=995347;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#66\u001b\\\u001b[2m66\u001b[0m\u001b]8;;\u001b\\\n", + "\u001b[2;36m[01/05/24 14:06:59]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Loaded index of type \u001b[1m<\u001b[0m\u001b[1;95mclass\u001b[0m\u001b[39m \u001b[0m\u001b[32m'faiss.swigfaiss.IndexHNSWFlat'\u001b[0m\u001b[1m>\u001b[0m and \u001b]8;id=951108;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\u001b\\\u001b[2mfaiss_indexer.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=612483;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#66\u001b\\\u001b[2m66\u001b[0m\u001b]8;;\u001b\\\n", "\u001b[2;36m \u001b[0m size \u001b[1;36m1967771\u001b[0m \u001b[2m \u001b[0m\n" ] }, @@ -390,7 +390,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "b477d6652c9e4326b9d9ab5f651d24a8", + "model_id": "6d71624d3cf4489eb2925db2738cf87f", "version_major": 2, "version_minor": 0 }, @@ -698,7 +698,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "f48fee7d065841388efc407f3aeaeac5", + "model_id": "18e52613deee4c73984228a078099896", "version_major": 2, "version_minor": 0 }, @@ -712,7 +712,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "fa32aa44068041bb97bd5ebfaaf77248", + "model_id": "1d389d47d62a49a89af6ce30ccbfb23c", "version_major": 2, "version_minor": 0 }, @@ -726,7 +726,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "aff7ad16c32648c5afb548173e64dfa9", + "model_id": "4df5f4cd32fa4313bdd86e140f50d2a9", "version_major": 2, "version_minor": 0 }, @@ -740,7 +740,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "237c7aa368b6418fbf9d7b1bb34d9c8e", + "model_id": "fa0789b1e274435c891bc8c9807475fa", "version_major": 2, "version_minor": 0 }, @@ -775,7 +775,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "b9a550a3efd34c76a001844587b21087", + "model_id": "ff5b9aaf85934a6bbdddd51fbd60fe85", "version_major": 2, "version_minor": 0 }, @@ -789,7 +789,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "bd911340658d487087b452f3e87c1d68", + "model_id": "202158acb5a949fda2d10d31774680f2", "version_major": 2, "version_minor": 0 }, @@ -803,7 +803,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "5986edf5b65e4fd3a53ecb22d4728ace", + "model_id": "e00a9111e44d4fdcb5271c065f60b3b3", "version_major": 2, "version_minor": 0 }, @@ -817,7 +817,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "a7a6159291d24e42a63e6e0a647dc41f", + "model_id": "dd32237ae8f545b59e9fe32588a7b0ff", "version_major": 2, "version_minor": 0 }, @@ -1075,14 +1075,6 @@ "get_dataframe(reranked, kb)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "9ace9bf0-c0ef-4c9e-9bd6-6ecce3cf03b6", - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "code", "execution_count": null, diff --git a/examples/README.md b/examples/README.md index d8f8a05..f100ff5 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1,13 +1,26 @@ # Examples +|Link|Language|Description| +|---|---|---| +|[01_BRONCO_German.ipynb](01_BRONCO_German.ipynb)|🇩🇪|Candidate generation and supervised re-ranking using the BRONCO corpus.
Shows how you can configure multiple dictionaries in the same config file.| +|[02_GGPONC_German.ipynb](02_GGPONC_German.ipynb)|🇩🇪|Using a spaCy NER model with xMEN
Shows how to build a pipeline without labelled data using candidate generation, type filtering and pre-trained re-rankers| +|[03_SNOMED_Linking_German.ipynb](03_SNOMED_Linking_German.ipynb)|🇩🇪|Linking against codes in UMLS source vocabularies (here SNOMED CT)| + +## External Links + |Link|Description| |---|---| -|[01_BRONCO_German.ipynb](01_BRONCO_German.ipynb)|Candidate generation and supervised re-ranking using the BRONCO corpus.
Shows how you can configure multiple dictionaries in the same config file.| -|[02_GGPONC_German.ipynb](02_GGPONC_German.ipynb)|Using a spaCy NER model with xMEN
Shows how to build a pipeline without labelled data using candidate generation, type filtering and pre-trained re-rankers| -|[03_SNOMED_Linking_German.ipynb](03_SNOMED_Linking_German.ipynb)|Linking against codes in UMLS source vocabularies (here SNOMED CT)| -| | | +| https://github.com/hpi-dhc/symptemist_biocreative_2023 | 🇪🇸 | BioCreative VIII SympTEMIST Challenge (1st place in entity linking track) | ## Benchmarks -More examples for configurations can be found in the [Benchmarks](../benchmarks) folder. \ No newline at end of file +More examples for configurations can be found in the [Benchmarks](../benchmarks) folder. + +|Benchmark|Language| +|---|---| +|[Quaero](../benchmarks/benchmark/quaero.yaml)|🇫🇷| +|[MedMentions](../benchmarks/benchmark/medmentions_en.yaml)|🇬🇧| +|[DisTEMIST](../benchmarks/benchmark/distemist.yaml)|🇪🇸| +|[BRONCO](../benchmarks/benchmark/bronco.yaml)|🇩🇪| +|[Mantra](../benchmarks/benchmark/mantra.yaml)|🇬🇧 🇫🇷 🇪🇸 🇩🇪 🇳🇱| \ No newline at end of file From a846fbdb925fcc3048ec55e83090a9bf15b8db56 Mon Sep 17 00:00:00 2001 From: Florian Borchert Date: Fri, 5 Jan 2024 14:10:03 +0100 Subject: [PATCH 3/7] Update README.md --- examples/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/README.md b/examples/README.md index f100ff5..41b843f 100644 --- a/examples/README.md +++ b/examples/README.md @@ -3,7 +3,7 @@ |Link|Language|Description| |---|---|---| |[01_BRONCO_German.ipynb](01_BRONCO_German.ipynb)|🇩🇪|Candidate generation and supervised re-ranking using the BRONCO corpus.
Shows how you can configure multiple dictionaries in the same config file.| -|[02_GGPONC_German.ipynb](02_GGPONC_German.ipynb)|🇩🇪|Using a spaCy NER model with xMEN
Shows how to build a pipeline without labelled data using candidate generation, type filtering and pre-trained re-rankers| +|[02_spaCy_German.ipynb](02_spaCy_German.ipynb)|🇩🇪|Using a spaCy NER model with xMEN
Shows how to build a pipeline without labelled data using candidate generation, type filtering and pre-trained re-rankers| |[03_SNOMED_Linking_German.ipynb](03_SNOMED_Linking_German.ipynb)|🇩🇪|Linking against codes in UMLS source vocabularies (here SNOMED CT)| ## External Links @@ -23,4 +23,4 @@ More examples for configurations can be found in the [Benchmarks](../benchmarks) |[MedMentions](../benchmarks/benchmark/medmentions_en.yaml)|🇬🇧| |[DisTEMIST](../benchmarks/benchmark/distemist.yaml)|🇪🇸| |[BRONCO](../benchmarks/benchmark/bronco.yaml)|🇩🇪| -|[Mantra](../benchmarks/benchmark/mantra.yaml)|🇬🇧 🇫🇷 🇪🇸 🇩🇪 🇳🇱| \ No newline at end of file +|[Mantra](../benchmarks/benchmark/mantra.yaml)|🇬🇧 🇫🇷 🇪🇸 🇩🇪 🇳🇱| From 7db95a5a8ac77f919de765b8e87a0ae276d33b54 Mon Sep 17 00:00:00 2001 From: Florian Borchert Date: Fri, 5 Jan 2024 14:11:29 +0100 Subject: [PATCH 4/7] Update README.md --- examples/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/README.md b/examples/README.md index 41b843f..82dd450 100644 --- a/examples/README.md +++ b/examples/README.md @@ -8,8 +8,8 @@ ## External Links -|Link|Description| -|---|---| +|Link|Language|Description| +|---|---|---| | https://github.com/hpi-dhc/symptemist_biocreative_2023 | 🇪🇸 | BioCreative VIII SympTEMIST Challenge (1st place in entity linking track) | From cc22698f426b27792215500e5acbd41535e4d67d Mon Sep 17 00:00:00 2001 From: Florian Borchert Date: Fri, 5 Jan 2024 14:12:19 +0100 Subject: [PATCH 5/7] Update README.md --- examples/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/README.md b/examples/README.md index 82dd450..a0741ab 100644 --- a/examples/README.md +++ b/examples/README.md @@ -15,7 +15,7 @@ ## Benchmarks -More examples for configurations can be found in the [Benchmarks](../benchmarks) folder. +More examples for configurations can be found in the [Benchmarks](../benchmarks/benchmark) folder. |Benchmark|Language| |---|---| From 04a414bd2ac6094991d579ef965e077df73fb78e Mon Sep 17 00:00:00 2001 From: Florian Borchert Date: Fri, 5 Jan 2024 14:30:11 +0100 Subject: [PATCH 6/7] Black --- examples/dicts/umls_source.py | 25 +++++++++++++++++-------- examples/util.py | 26 ++++++++++++++++++-------- 2 files changed, 35 insertions(+), 16 deletions(-) diff --git a/examples/dicts/umls_source.py b/examples/dicts/umls_source.py index a32222d..7f8ab32 100644 --- a/examples/dicts/umls_source.py +++ b/examples/dicts/umls_source.py @@ -3,6 +3,7 @@ from collections import defaultdict from xmen.log import logger + def get_concept_details(cfg): mrconso = "MRCONSO.RRF" concept_details = {} @@ -23,12 +24,14 @@ def get_concept_details(cfg): cui = concept["CUI"] if id_key: sid = concept[id_key] - else: + else: sid = concept["SDUI"] if not sid: sid = concept["SCUI"] if not sid: - logger.warn(f"Skipping concept with CUI {cui} because we could not find a valid source vocabulary ID") + logger.warn( + f"Skipping concept with CUI {cui} because we could not find a valid source vocabulary ID" + ) name = concept["STR"] if sid in concept_details: concept_details[sid]["aliases"].append(name) @@ -36,19 +39,25 @@ def get_concept_details(cfg): concept_details[sid] = {"concept_id": sid, "canonical_name": name, "types": [], "aliases": []} scui2cui[sid].append(cui) - if umls_extend := cfg.dict.custom.get('umls_extend'): + if umls_extend := cfg.dict.custom.get("umls_extend"): # Optionally extend with UMLS synonyms - other_umls_concepts = get_umls_concepts(meta_path, - umls_extend.get("lang"), sabs=umls_extend.get("sabs"), sources=umls_extend.get("sources"), semantic_groups=umls_extend.get("semantic_groups"), semantic_types=umls_extend.get("semantic_types")) + other_umls_concepts = get_umls_concepts( + meta_path, + umls_extend.get("lang"), + sabs=umls_extend.get("sabs"), + sources=umls_extend.get("sources"), + semantic_groups=umls_extend.get("semantic_groups"), + semantic_types=umls_extend.get("semantic_types"), + ) for scui, concept in tqdm(concept_details.items()): for cui in scui2cui[scui]: if cui in other_umls_concepts: - for t in other_umls_concepts[cui]['types']: + for t in other_umls_concepts[cui]["types"]: if not t in concept["types"]: concept["types"].append(t) for new_alias in other_umls_concepts[cui]["aliases"] + [other_umls_concepts[cui]["canonical_name"]]: if new_alias not in concept["aliases"] and new_alias != concept["canonical_name"]: - concept["aliases"].append(new_alias) - + concept["aliases"].append(new_alias) + return concept_details diff --git a/examples/util.py b/examples/util.py index a62927f..1f9f7d4 100644 --- a/examples/util.py +++ b/examples/util.py @@ -1,15 +1,25 @@ import pandas as pd + def get_dataframe(predictions, kb): ents = [] for d in predictions: - for e in d['entities']: - span = ' '.join(e['text']) - label = e['type'] - top_concept = e['normalized'][0] if len(e['normalized']) > 0 else None + for e in d["entities"]: + span = " ".join(e["text"]) + label = e["type"] + top_concept = e["normalized"][0] if len(e["normalized"]) > 0 else None if top_concept: - cui = top_concept['db_id'] - ents.append({'mention' : span, 'class' : label, 'cui' : cui, 'canonical name' : kb.cui_to_entity[cui].canonical_name, 'linked by' : top_concept['predicted_by'], 'score' : top_concept['score']}) + cui = top_concept["db_id"] + ents.append( + { + "mention": span, + "class": label, + "cui": cui, + "canonical name": kb.cui_to_entity[cui].canonical_name, + "linked by": top_concept["predicted_by"], + "score": top_concept["score"], + } + ) else: - ents.append({'mention' : span, 'class' : label, 'cui' : 'Not linkable'}) - return pd.DataFrame(ents) \ No newline at end of file + ents.append({"mention": span, "class": label, "cui": "Not linkable"}) + return pd.DataFrame(ents) From 7310147cef3e480b384036fcd2bc6eac2f2c7b70 Mon Sep 17 00:00:00 2001 From: Florian Borchert Date: Fri, 5 Jan 2024 15:01:31 +0100 Subject: [PATCH 7/7] Move DisTEMIST --- examples/old_examples/conf/distemist.yaml | 7 ------- .../01_BioASQ_DisTEMIST.ipynb | 0 .../{distemist => distemist_clef2023}/distemist.py | 0 .../distemist_bioasq.yaml | 0 .../{distemist => distemist_clef2023}/notebook_util.py | 0 5 files changed, 7 deletions(-) delete mode 100644 examples/old_examples/conf/distemist.yaml rename examples/old_examples/{distemist => distemist_clef2023}/01_BioASQ_DisTEMIST.ipynb (100%) rename examples/old_examples/{distemist => distemist_clef2023}/distemist.py (100%) rename examples/old_examples/{distemist => distemist_clef2023}/distemist_bioasq.yaml (100%) rename examples/old_examples/{distemist => distemist_clef2023}/notebook_util.py (100%) diff --git a/examples/old_examples/conf/distemist.yaml b/examples/old_examples/conf/distemist.yaml deleted file mode 100644 index a491cae..0000000 --- a/examples/old_examples/conf/distemist.yaml +++ /dev/null @@ -1,7 +0,0 @@ -name: distemist - -dict: - custom: - lang: - - es - distemist_path: local_files/dictionary_distemist.tsv \ No newline at end of file diff --git a/examples/old_examples/distemist/01_BioASQ_DisTEMIST.ipynb b/examples/old_examples/distemist_clef2023/01_BioASQ_DisTEMIST.ipynb similarity index 100% rename from examples/old_examples/distemist/01_BioASQ_DisTEMIST.ipynb rename to examples/old_examples/distemist_clef2023/01_BioASQ_DisTEMIST.ipynb diff --git a/examples/old_examples/distemist/distemist.py b/examples/old_examples/distemist_clef2023/distemist.py similarity index 100% rename from examples/old_examples/distemist/distemist.py rename to examples/old_examples/distemist_clef2023/distemist.py diff --git a/examples/old_examples/distemist/distemist_bioasq.yaml b/examples/old_examples/distemist_clef2023/distemist_bioasq.yaml similarity index 100% rename from examples/old_examples/distemist/distemist_bioasq.yaml rename to examples/old_examples/distemist_clef2023/distemist_bioasq.yaml diff --git a/examples/old_examples/distemist/notebook_util.py b/examples/old_examples/distemist_clef2023/notebook_util.py similarity index 100% rename from examples/old_examples/distemist/notebook_util.py rename to examples/old_examples/distemist_clef2023/notebook_util.py