From 253a477c32587093286297b885c3db0a69a224ad Mon Sep 17 00:00:00 2001
From: Florian Borchert <Florian.Borchert@hpi.de>
Date: Fri, 5 Jan 2024 13:57:40 +0100
Subject: [PATCH 1/7] Update Examples

---
 ...02_BRONCO.ipynb => 01_BRONCO_German.ipynb} |   11 -
 ...GGPONC_NER.ipynb => 02_spaCy_German.ipynb} |  148 +--
 examples/03_SNOMED_Linking_German.ipynb       | 1116 +++++++++++++++++
 examples/README.md                            |   13 +
 examples/Temp.ipynb                           |  216 ----
 examples/conf/meddra_german.yaml              |    8 +-
 examples/conf/snomed_german.yaml              |   24 +
 examples/dicts/umls_source.py                 |   35 +-
 examples/ggponc2tui.csv                       |  128 --
 examples/ggponc_tuis.csv                      |  414 ++++++
 .../{ => old_examples}/conf/distemist.yaml    |    0
 .../distemist}/01_BioASQ_DisTEMIST.ipynb      |    0
 .../distemist}/distemist.py                   |    0
 .../distemist}/distemist_bioasq.yaml          |    0
 .../distemist}/notebook_util.py               |    0
 examples/util.py                              |   15 +
 16 files changed, 1671 insertions(+), 457 deletions(-)
 rename examples/{02_BRONCO.ipynb => 01_BRONCO_German.ipynb} (99%)
 rename examples/{03_GGPONC_NER.ipynb => 02_spaCy_German.ipynb} (93%)
 create mode 100644 examples/03_SNOMED_Linking_German.ipynb
 create mode 100644 examples/README.md
 delete mode 100644 examples/Temp.ipynb
 create mode 100644 examples/conf/snomed_german.yaml
 delete mode 100644 examples/ggponc2tui.csv
 create mode 100644 examples/ggponc_tuis.csv
 rename examples/{ => old_examples}/conf/distemist.yaml (100%)
 rename examples/{ => old_examples/distemist}/01_BioASQ_DisTEMIST.ipynb (100%)
 rename examples/{dicts => old_examples/distemist}/distemist.py (100%)
 rename examples/{ => old_examples/distemist}/distemist_bioasq.yaml (100%)
 rename examples/{ => old_examples/distemist}/notebook_util.py (100%)
 create mode 100644 examples/util.py
diff --git a/examples/02_BRONCO.ipynb b/examples/01_BRONCO_German.ipynb
similarity index 99%
rename from examples/02_BRONCO.ipynb
rename to examples/01_BRONCO_German.ipynb
index 1e0fcf4..6f4dcab 100644
--- a/examples/02_BRONCO.ipynb
+++ b/examples/01_BRONCO_German.ipynb
@@ -1,16 +1,5 @@
 {
  "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "0271244e-11b2-44ab-82e8-cf4cdcad6b13",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import os\n",
-    "os.environ['CUDA_VISIBLE_DEVICES'] = '5'"
-   ]
-  },
   {
    "cell_type": "markdown",
    "id": "eb84db40-b21a-4fba-8174-a327a831ef76",
diff --git a/examples/03_GGPONC_NER.ipynb b/examples/02_spaCy_German.ipynb
similarity index 93%
rename from examples/03_GGPONC_NER.ipynb
rename to examples/02_spaCy_German.ipynb
index 7725e73..3f768eb 100644
--- a/examples/03_GGPONC_NER.ipynb
+++ b/examples/02_spaCy_German.ipynb
@@ -5,7 +5,7 @@
    "id": "6a3e8444-54ec-41a2-aaef-3849a22cde57",
    "metadata": {},
    "source": [
-    "# Combinining NER models with xMEN for German Clinical Entity Linking"
+    "# Combining spaCy NER models with xMEN for German Clinical Entity Linking"
    ]
   },
   {
@@ -21,7 +21,7 @@
    "id": "68020a8a-9c53-4f75-a08e-5078ec5b7f35",
    "metadata": {},
    "source": [
-    "### Download NER model"
+    "### Download NER Model"
    ]
   },
   {
@@ -70,7 +70,7 @@
    "id": "407e49a2-7a40-4982-bf1b-d15de505d039",
    "metadata": {},
    "source": [
-    "# Run GGPONC NER Model on sample data"
+    "## Run spaCy NER Model on Sample Data"
    ]
   },
   {
@@ -290,7 +290,7 @@
    "id": "0ae82793-8135-41a2-b6f5-75e39e1e57b1",
    "metadata": {},
    "source": [
-    "# Run Entity Linker"
+    "## Candidate Generation"
    ]
   },
   {
@@ -381,11 +381,11 @@
     {
      "data": {
       "text/html": [
-       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[01/04/24 15:04:26] </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO    </span> Loading hierarchical faiss index                                <a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">sap_bert_linker.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py#153\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">153</span></a>\n",
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[01/05/24 13:54:34] </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO    </span> Loading hierarchical faiss index                                <a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">sap_bert_linker.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py#153\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">153</span></a>\n",
        "</pre>\n"
       ],
       "text/plain": [
-       "\u001b[2;36m[01/04/24 15:04:26]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO    \u001b[0m Loading hierarchical faiss index                                \u001b]8;id=605687;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py\u001b\\\u001b[2msap_bert_linker.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=745349;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py#153\u001b\\\u001b[2m153\u001b[0m\u001b]8;;\u001b\\\n"
+       "\u001b[2;36m[01/05/24 13:54:34]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO    \u001b[0m Loading hierarchical faiss index                                \u001b]8;id=378882;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py\u001b\\\u001b[2msap_bert_linker.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=337023;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py#153\u001b\\\u001b[2m153\u001b[0m\u001b]8;;\u001b\\\n"
       ]
      },
      "metadata": {},
@@ -400,7 +400,7 @@
        "</pre>\n"
       ],
       "text/plain": [
-       "\u001b[2;36m                   \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO    \u001b[0m Loading index from                                                 \u001b]8;id=911485;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\u001b\\\u001b[2mfaiss_indexer.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=711460;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#64\u001b\\\u001b[2m64\u001b[0m\u001b]8;;\u001b\\\n",
+       "\u001b[2;36m                   \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO    \u001b[0m Loading index from                                                 \u001b]8;id=290282;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\u001b\\\u001b[2mfaiss_indexer.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=834572;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#64\u001b\\\u001b[2m64\u001b[0m\u001b]8;;\u001b\\\n",
        "\u001b[2;36m                    \u001b[0m         \u001b[35m/home/Florian.Borchert/.cache/xmen/ggponc/index/sapbert/\u001b[0m\u001b[95membed_fais\u001b[0m \u001b[2m                   \u001b[0m\n",
        "\u001b[2;36m                    \u001b[0m         \u001b[95ms_hier.pickle\u001b[0m                                                      \u001b[2m                   \u001b[0m\n"
       ]
@@ -411,12 +411,12 @@
     {
      "data": {
       "text/html": [
-       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[01/04/24 15:04:30] </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO    </span> Loaded index of type <span style=\"font-weight: bold\">&lt;</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff; font-weight: bold\">class</span><span style=\"color: #000000; text-decoration-color: #000000\"> </span><span style=\"color: #008000; text-decoration-color: #008000\">'faiss.swigfaiss.IndexHNSWFlat'</span><span style=\"font-weight: bold\">&gt;</span> and   <a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">faiss_indexer.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#66\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">66</span></a>\n",
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[01/05/24 13:54:39] </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO    </span> Loaded index of type <span style=\"font-weight: bold\">&lt;</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff; font-weight: bold\">class</span><span style=\"color: #000000; text-decoration-color: #000000\"> </span><span style=\"color: #008000; text-decoration-color: #008000\">'faiss.swigfaiss.IndexHNSWFlat'</span><span style=\"font-weight: bold\">&gt;</span> and   <a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">faiss_indexer.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#66\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">66</span></a>\n",
        "<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">                    </span>         size <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">2906321</span>                                                       <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">                   </span>\n",
        "</pre>\n"
       ],
       "text/plain": [
-       "\u001b[2;36m[01/04/24 15:04:30]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO    \u001b[0m Loaded index of type \u001b[1m<\u001b[0m\u001b[1;95mclass\u001b[0m\u001b[39m \u001b[0m\u001b[32m'faiss.swigfaiss.IndexHNSWFlat'\u001b[0m\u001b[1m>\u001b[0m and   \u001b]8;id=874363;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\u001b\\\u001b[2mfaiss_indexer.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=363764;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#66\u001b\\\u001b[2m66\u001b[0m\u001b]8;;\u001b\\\n",
+       "\u001b[2;36m[01/05/24 13:54:39]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO    \u001b[0m Loaded index of type \u001b[1m<\u001b[0m\u001b[1;95mclass\u001b[0m\u001b[39m \u001b[0m\u001b[32m'faiss.swigfaiss.IndexHNSWFlat'\u001b[0m\u001b[1m>\u001b[0m and   \u001b]8;id=653504;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\u001b\\\u001b[2mfaiss_indexer.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=376608;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#66\u001b\\\u001b[2m66\u001b[0m\u001b]8;;\u001b\\\n",
        "\u001b[2;36m                    \u001b[0m         size \u001b[1;36m2906321\u001b[0m                                                       \u001b[2m                   \u001b[0m\n"
       ]
      },
@@ -438,7 +438,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "e3e916549d2f43c6a2d5414996f10447",
+       "model_id": "6bfe4331629c4563b9f39cf1a1e05481",
        "version_major": 2,
        "version_minor": 0
       },
@@ -463,7 +463,7 @@
    "id": "b3689a61-2ee1-4c81-afc7-6f6751654020",
    "metadata": {},
    "source": [
-    "## Semantic Type Filtering\n",
+    "### Semantic Type Filtering\n",
     "\n",
     "We filter the generated output to make sure the semantic type of the predicted concepts actually matches the semantic class of the named entity.\n",
     "\n",
@@ -502,33 +502,22 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "tui_df = pd.read_csv('ggponc2tui.csv')\n",
-    "type2tui = {}\n",
-    "for c in ['Diagnosis_or_Pathology', 'Other_Finding', 'Clinical_Drug', 'Nutrient_or_Body_Substance',\n",
-    "       'External_Substance', 'Therapeutic', 'Diagnostic']:\n",
-    "    type2tui[c] = list(tui_df.TUI[tui_df[c] == 'x'].values)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 18,
-   "id": "0b7f48f2-9b8d-41f7-a526-49b9ff318170",
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    "from xmen.data import SemanticTypeFilter\n",
+    "\n",
+    "type2tui = pd.read_csv('ggponc_tuis.csv').groupby('class')['tui'].apply(list).to_dict()\n",
     "type_filter = SemanticTypeFilter(type2tui, kb)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 18,
    "id": "14ebda62-a868-4bc9-8c84-a0580935b0c7",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "c5305b0a8d1545d093edfd187ae62333",
+       "model_id": "fe0b1a358a4d4df39539c448e28845b4",
        "version_major": 2,
        "version_minor": 0
       },
@@ -546,7 +535,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 19,
    "id": "308b40e7-d97a-4b58-b933-ecc8083fcc97",
    "metadata": {},
    "outputs": [
@@ -574,7 +563,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 20,
    "id": "bc358d6c-c1a6-4b8a-9839-3a527c3f2a67",
    "metadata": {},
    "outputs": [
@@ -600,40 +589,10 @@
     "print(kb.cui_to_entity['C1739039'])"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "id": "b43d94d5-63b6-478a-8bc6-fdee335c9046",
-   "metadata": {},
-   "source": [
-    "## Output"
-   ]
-  },
   {
    "cell_type": "code",
-   "execution_count": 22,
-   "id": "9b0c2d1d-727b-4f52-90be-dc468cf0f210",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def get_dataframe(predictions):\n",
-    "    ents = []\n",
-    "    for d in predictions:\n",
-    "        for e in d['entities']:\n",
-    "            span = ' '.join(e['text'])\n",
-    "            label = e['type']\n",
-    "            top_concept = e['normalized'][0] if len(e['normalized']) > 0 else None        \n",
-    "            if top_concept:\n",
-    "                cui = top_concept['db_id']\n",
-    "                ents.append({'mention' : span, 'class' :  label, 'cui' : cui, 'canonical name' : kb.cui_to_entity[cui].canonical_name, 'linked by' : top_concept['predicted_by'], 'score' : top_concept['score']})\n",
-    "            else:\n",
-    "                ents.append({'mention' : span, 'class' :  label, 'cui' : 'Not linkable'})\n",
-    "    return pd.DataFrame(ents)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 23,
-   "id": "c3a659c4-0371-4b6f-9544-8b1fc6615c61",
+   "execution_count": 21,
+   "id": "c02ef4fd-dc7f-487d-b059-8643b3886ba5",
    "metadata": {},
    "outputs": [
     {
@@ -846,13 +805,14 @@
        "13  Plattenepithelkarzinom der Mundhoehle         [sap]  0.987126  "
       ]
      },
-     "execution_count": 23,
+     "execution_count": 21,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "get_dataframe(filtered_prediction)"
+    "from util import get_dataframe\n",
+    "get_dataframe(filtered_prediction, kb)"
    ]
   },
   {
@@ -865,7 +825,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 22,
    "id": "9b7b5c5f-e617-4975-822a-fa6bce38397a",
    "metadata": {},
    "outputs": [
@@ -884,11 +844,11 @@
     {
      "data": {
       "text/html": [
-       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[01/04/24 15:04:58] </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO    </span> Loading hierarchical faiss index                                <a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">sap_bert_linker.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py#153\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">153</span></a>\n",
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[01/05/24 13:55:12] </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO    </span> Loading hierarchical faiss index                                <a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">sap_bert_linker.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py#153\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">153</span></a>\n",
        "</pre>\n"
       ],
       "text/plain": [
-       "\u001b[2;36m[01/04/24 15:04:58]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO    \u001b[0m Loading hierarchical faiss index                                \u001b]8;id=693868;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py\u001b\\\u001b[2msap_bert_linker.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=370078;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py#153\u001b\\\u001b[2m153\u001b[0m\u001b]8;;\u001b\\\n"
+       "\u001b[2;36m[01/05/24 13:55:12]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO    \u001b[0m Loading hierarchical faiss index                                \u001b]8;id=29828;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py\u001b\\\u001b[2msap_bert_linker.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=247780;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py#153\u001b\\\u001b[2m153\u001b[0m\u001b]8;;\u001b\\\n"
       ]
      },
      "metadata": {},
@@ -903,7 +863,7 @@
        "</pre>\n"
       ],
       "text/plain": [
-       "\u001b[2;36m                   \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO    \u001b[0m Loading index from                                                 \u001b]8;id=347044;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\u001b\\\u001b[2mfaiss_indexer.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=137374;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#64\u001b\\\u001b[2m64\u001b[0m\u001b]8;;\u001b\\\n",
+       "\u001b[2;36m                   \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO    \u001b[0m Loading index from                                                 \u001b]8;id=165201;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\u001b\\\u001b[2mfaiss_indexer.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=284277;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#64\u001b\\\u001b[2m64\u001b[0m\u001b]8;;\u001b\\\n",
        "\u001b[2;36m                    \u001b[0m         \u001b[35m/home/Florian.Borchert/.cache/xmen/ggponc/index/sapbert/\u001b[0m\u001b[95membed_fais\u001b[0m \u001b[2m                   \u001b[0m\n",
        "\u001b[2;36m                    \u001b[0m         \u001b[95ms_hier.pickle\u001b[0m                                                      \u001b[2m                   \u001b[0m\n"
       ]
@@ -914,12 +874,12 @@
     {
      "data": {
       "text/html": [
-       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[01/04/24 15:05:02] </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO    </span> Loaded index of type <span style=\"font-weight: bold\">&lt;</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff; font-weight: bold\">class</span><span style=\"color: #000000; text-decoration-color: #000000\"> </span><span style=\"color: #008000; text-decoration-color: #008000\">'faiss.swigfaiss.IndexHNSWFlat'</span><span style=\"font-weight: bold\">&gt;</span> and   <a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">faiss_indexer.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#66\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">66</span></a>\n",
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[01/05/24 13:55:17] </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO    </span> Loaded index of type <span style=\"font-weight: bold\">&lt;</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff; font-weight: bold\">class</span><span style=\"color: #000000; text-decoration-color: #000000\"> </span><span style=\"color: #008000; text-decoration-color: #008000\">'faiss.swigfaiss.IndexHNSWFlat'</span><span style=\"font-weight: bold\">&gt;</span> and   <a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">faiss_indexer.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#66\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">66</span></a>\n",
        "<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">                    </span>         size <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">2906321</span>                                                       <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">                   </span>\n",
        "</pre>\n"
       ],
       "text/plain": [
-       "\u001b[2;36m[01/04/24 15:05:02]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO    \u001b[0m Loaded index of type \u001b[1m<\u001b[0m\u001b[1;95mclass\u001b[0m\u001b[39m \u001b[0m\u001b[32m'faiss.swigfaiss.IndexHNSWFlat'\u001b[0m\u001b[1m>\u001b[0m and   \u001b]8;id=496193;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\u001b\\\u001b[2mfaiss_indexer.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=514230;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#66\u001b\\\u001b[2m66\u001b[0m\u001b]8;;\u001b\\\n",
+       "\u001b[2;36m[01/05/24 13:55:17]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO    \u001b[0m Loaded index of type \u001b[1m<\u001b[0m\u001b[1;95mclass\u001b[0m\u001b[39m \u001b[0m\u001b[32m'faiss.swigfaiss.IndexHNSWFlat'\u001b[0m\u001b[1m>\u001b[0m and   \u001b]8;id=433500;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\u001b\\\u001b[2mfaiss_indexer.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=574493;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#66\u001b\\\u001b[2m66\u001b[0m\u001b]8;;\u001b\\\n",
        "\u001b[2;36m                    \u001b[0m         size \u001b[1;36m2906321\u001b[0m                                                       \u001b[2m                   \u001b[0m\n"
       ]
      },
@@ -934,14 +894,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 23,
    "id": "411c0814-5fc5-4b70-ab96-ef07833cbf08",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "b766e64187554e65916a818539a62f07",
+       "model_id": "80d4a77a89a848309a1342e4d8846505",
        "version_major": 2,
        "version_minor": 0
       },
@@ -955,7 +915,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "7b20de793c204aa2821b9c839ed74977",
+       "model_id": "b2be54262eab46b8963c3626a5c88891",
        "version_major": 2,
        "version_minor": 0
       },
@@ -973,7 +933,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 24,
    "id": "8185c65f-5645-4c38-a11a-a32a4f24a2a1",
    "metadata": {},
    "outputs": [],
@@ -983,7 +943,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 25,
    "id": "07d41590-f991-474a-9109-9b75d3184266",
    "metadata": {},
    "outputs": [
@@ -998,7 +958,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "720781390c534053b3b05f6d32f7ea6d",
+       "model_id": "c09461422f7a44f08091419edeb74ab0",
        "version_major": 2,
        "version_minor": 0
       },
@@ -1012,7 +972,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "7c893f7acd7745efa971e2da94fff95f",
+       "model_id": "48de214e279947e685cc324463bd7dca",
        "version_major": 2,
        "version_minor": 0
       },
@@ -1026,7 +986,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "4fbfd064903042359d6bd259c9885675",
+       "model_id": "fe4d431f45834f45b064a867540c59a6",
        "version_major": 2,
        "version_minor": 0
       },
@@ -1040,7 +1000,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "624d4b4e391346cabc7fee7ee24505ba",
+       "model_id": "b38b631271854036853f2e192174765b",
        "version_major": 2,
        "version_minor": 0
       },
@@ -1058,7 +1018,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 26,
    "id": "6613f881-9323-42c7-a63c-710c6a1d758c",
    "metadata": {},
    "outputs": [],
@@ -1068,14 +1028,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 27,
    "id": "3af8dfbf-14b1-4a47-b76b-14868cde55e6",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "0de5605717c84cbb9934f1f08b26abbf",
+       "model_id": "1c56209e5bbb4a3aa2a33b6db8358e64",
        "version_major": 2,
        "version_minor": 0
       },
@@ -1089,7 +1049,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "06074a2114af4bc58fe6a7b67c3fd4a8",
+       "model_id": "7d680c9f293f46a8939818239df7132d",
        "version_major": 2,
        "version_minor": 0
       },
@@ -1103,7 +1063,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "45751c18d9a7432f901ef25e98e0a110",
+       "model_id": "4238449d87694b36b0ec3116e43927c8",
        "version_major": 2,
        "version_minor": 0
       },
@@ -1117,7 +1077,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "5de8ca1225eb497bb70504d0b213e7fc",
+       "model_id": "ec464f1617704cb7997d30ae9748a9c7",
        "version_major": 2,
        "version_minor": 0
       },
@@ -1135,7 +1095,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 28,
    "id": "8bdfb06e-a38b-431d-b82a-c515e6a00078",
    "metadata": {},
    "outputs": [
@@ -1349,19 +1309,19 @@
        "13  Plattenepithelkarzinom der Mundhoehle  [ngram, sapbert]  0.987126  "
       ]
      },
-     "execution_count": 30,
+     "execution_count": 28,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "# Before Re-ranking\n",
-    "get_dataframe(candidates)"
+    "get_dataframe(candidates, kb)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 29,
    "id": "0f95ab2d-5f3b-432c-9214-64316b3f1e99",
    "metadata": {},
    "outputs": [
@@ -1575,23 +1535,15 @@
        "13        Plattenepithelkarzinom der Mundhoehle  [ngram, sapbert]  0.042749  "
       ]
      },
-     "execution_count": 31,
+     "execution_count": 29,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "# After Re-ranking\n",
-    "get_dataframe(reranked)"
+    "get_dataframe(reranked, kb)"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "9ace9bf0-c0ef-4c9e-9bd6-6ecce3cf03b6",
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
diff --git a/examples/03_SNOMED_Linking_German.ipynb b/examples/03_SNOMED_Linking_German.ipynb
new file mode 100644
index 0000000..3f43598
--- /dev/null
+++ b/examples/03_SNOMED_Linking_German.ipynb
@@ -0,0 +1,1116 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "6a3e8444-54ec-41a2-aaef-3849a22cde57",
+   "metadata": {},
+   "source": [
+    "# Linking Entities in German Medical Text to SNOMED CT"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "eace0372-6610-4a03-bab6-32e186911bac",
+   "metadata": {},
+   "source": [
+    "## Preparation\n",
+    "\n",
+    "### Download NER Model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "50943603-e8d3-4157-a675-0d9f41f5479c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Consider using `hf_transfer` for faster downloads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details.\n",
+      "../local_files/de_ggponc_medbertde-any-py3-none-any.whl\n"
+     ]
+    }
+   ],
+   "source": [
+    "!huggingface-cli download phlobo/de_ggponc_medbertde de_ggponc_medbertde-any-py3-none-any.whl --local-dir ../local_files\n",
+    "!pip install -q ../local_files/de_ggponc_medbertde-any-py3-none-any.whl"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f216b159-7112-473e-a09e-6eff33fa7a34",
+   "metadata": {},
+   "source": [
+    "### Prepare Dicts and Index\n",
+    "\n",
+    "`xmen dict conf/snomed_german.yaml --code dicts/umls_source.py`\n",
+    "\n",
+    "`xmen index conf/snomed_german.yaml --all --overwrite`"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "407e49a2-7a40-4982-bf1b-d15de505d039",
+   "metadata": {},
+   "source": [
+    "## Entity Tagging"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "521b389e-ce31-4e1f-9e37-29986d475a82",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import spacy\n",
+    "nlp = spacy.load('de_ggponc_medbertde')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "8049cf19-4a70-4bac-af9a-416c4e7a60e5",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['Cetuximab ist ein monoklonaler Antikörper, der gegen den epidermalen Wachstumsfaktorrezeptor (EGFR) gerichtet ist und dient zur Therapie des fortgeschrittenen kolorektalen Karzinoms zusammen mit Irinotecan oder in Kombination mit FOLFOX bzw. allein nach Versagen einer Behandlung mit Oxaliplatin und Irinotecan.',\n",
+       " 'Die HPV-Diagnostik hat beim Plattenepithelkarzinom der Mundhöhle keinen validen Nutzen als prognostischer Faktor.']"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sentences = [\n",
+    "    \"Cetuximab ist ein monoklonaler Antikörper, der gegen den epidermalen Wachstumsfaktorrezeptor (EGFR) gerichtet ist und \" \\\n",
+    "       \"dient zur Therapie des fortgeschrittenen kolorektalen Karzinoms zusammen mit Irinotecan oder in Kombination mit FOLFOX bzw. \" \\\n",
+    "       \"allein nach Versagen einer Behandlung mit Oxaliplatin und Irinotecan.\",\n",
+    "    \"Die HPV-Diagnostik hat beim Plattenepithelkarzinom der Mundhöhle keinen validen Nutzen als prognostischer Faktor.\"\n",
+    "]\n",
+    "sentences"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "5feca36d-cff3-4e02-858e-8d46c0c3d579",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "docs = list(nlp.pipe(sentences))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "4a663fcf-c369-4e0c-9b90-c51fecba6019",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "bd0ab2ae-bd78-4c4f-bec7-e8fc119690e1",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>mention</th>\n",
+       "      <th>class</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Cetuximab</td>\n",
+       "      <td>Clinical_Drug</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>monoklonaler Antikörper</td>\n",
+       "      <td>Clinical_Drug</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>epidermalen Wachstumsfaktorrezeptor</td>\n",
+       "      <td>Nutrient_or_Body_Substance</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>EGFR</td>\n",
+       "      <td>Nutrient_or_Body_Substance</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Therapie des fortgeschrittenen kolorektalen Ka...</td>\n",
+       "      <td>Therapeutic</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>fortgeschrittenen kolorektalen Karzinoms</td>\n",
+       "      <td>Diagnosis_or_Pathology</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>Irinotecan</td>\n",
+       "      <td>Clinical_Drug</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>FOLFOX</td>\n",
+       "      <td>Therapeutic</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>Versagen einer Behandlung</td>\n",
+       "      <td>Other_Finding</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>Behandlung mit Oxaliplatin und Irinotecan</td>\n",
+       "      <td>Therapeutic</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>Oxaliplatin</td>\n",
+       "      <td>Clinical_Drug</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11</th>\n",
+       "      <td>Irinotecan</td>\n",
+       "      <td>Clinical_Drug</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>12</th>\n",
+       "      <td>HPV-Diagnostik</td>\n",
+       "      <td>Diagnostic</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>13</th>\n",
+       "      <td>Plattenepithelkarzinom der Mundhöhle</td>\n",
+       "      <td>Diagnosis_or_Pathology</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                              mention  \\\n",
+       "0                                           Cetuximab   \n",
+       "1                             monoklonaler Antikörper   \n",
+       "2                 epidermalen Wachstumsfaktorrezeptor   \n",
+       "3                                                EGFR   \n",
+       "4   Therapie des fortgeschrittenen kolorektalen Ka...   \n",
+       "5            fortgeschrittenen kolorektalen Karzinoms   \n",
+       "6                                          Irinotecan   \n",
+       "7                                              FOLFOX   \n",
+       "8                           Versagen einer Behandlung   \n",
+       "9           Behandlung mit Oxaliplatin und Irinotecan   \n",
+       "10                                        Oxaliplatin   \n",
+       "11                                         Irinotecan   \n",
+       "12                                     HPV-Diagnostik   \n",
+       "13               Plattenepithelkarzinom der Mundhöhle   \n",
+       "\n",
+       "                         class  \n",
+       "0                Clinical_Drug  \n",
+       "1                Clinical_Drug  \n",
+       "2   Nutrient_or_Body_Substance  \n",
+       "3   Nutrient_or_Body_Substance  \n",
+       "4                  Therapeutic  \n",
+       "5       Diagnosis_or_Pathology  \n",
+       "6                Clinical_Drug  \n",
+       "7                  Therapeutic  \n",
+       "8                Other_Finding  \n",
+       "9                  Therapeutic  \n",
+       "10               Clinical_Drug  \n",
+       "11               Clinical_Drug  \n",
+       "12                  Diagnostic  \n",
+       "13      Diagnosis_or_Pathology  "
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "ents = []\n",
+    "for d in docs:\n",
+    "    for span in sorted(d.spans['entities'], key=lambda s: s.start):\n",
+    "        ents.append({'mention' : span.text, 'class' : span.label_})\n",
+    "pd.DataFrame(ents)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0ae82793-8135-41a2-b6f5-75e39e1e57b1",
+   "metadata": {},
+   "source": [
+    "## Candidate Generation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "f5b0add5-9fb7-4eb4-9495-91ee5a5c95ea",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "from xmen.data import from_spacy\n",
+    "from xmen.linkers import SapBERTLinker, TFIDFNGramLinker, EnsembleLinker\n",
+    "from xmen import load_config"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "46831d66-e521-4225-9b92-a40d7b7582e4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset = from_spacy(docs, span_key='entities')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "5abb3126-ba45-47b1-a1a6-912652fefa3b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Dataset({\n",
+       "    features: ['id', 'document_id', 'passages', 'entities', 'coreferences', 'relations', 'events', 'corpus_id', 'lang'],\n",
+       "    num_rows: 2\n",
+       "})"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "286a07dc-e5a3-42af-89f7-b48e035f0d25",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "conf = load_config('../examples/conf/snomed_german.yaml')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "ec6cd089-3a63-45e8-9178-86999e7f0441",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[01/05/24 13:54:14] </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO    </span> Loading hierarchical faiss index                                <a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">sap_bert_linker.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py#153\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">153</span></a>\n",
+       "</pre>\n"
+      ],
+      "text/plain": [
+       "\u001b[2;36m[01/05/24 13:54:14]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO    \u001b[0m Loading hierarchical faiss index                                \u001b]8;id=183098;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py\u001b\\\u001b[2msap_bert_linker.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=912029;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py#153\u001b\\\u001b[2m153\u001b[0m\u001b]8;;\u001b\\\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">                    </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO    </span> Loading index from                                                 <a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">faiss_indexer.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#64\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">64</span></a>\n",
+       "<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">                    </span>         <span style=\"color: #800080; text-decoration-color: #800080\">/home/Florian.Borchert/.cache/xmen/snomed_german/index/sapbert/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">emb</span> <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">                   </span>\n",
+       "<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">                    </span>         <span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">ed_faiss_hier.pickle</span>                                               <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">                   </span>\n",
+       "</pre>\n"
+      ],
+      "text/plain": [
+       "\u001b[2;36m                   \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO    \u001b[0m Loading index from                                                 \u001b]8;id=599606;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\u001b\\\u001b[2mfaiss_indexer.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=458220;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#64\u001b\\\u001b[2m64\u001b[0m\u001b]8;;\u001b\\\n",
+       "\u001b[2;36m                    \u001b[0m         \u001b[35m/home/Florian.Borchert/.cache/xmen/snomed_german/index/sapbert/\u001b[0m\u001b[95memb\u001b[0m \u001b[2m                   \u001b[0m\n",
+       "\u001b[2;36m                    \u001b[0m         \u001b[95med_faiss_hier.pickle\u001b[0m                                               \u001b[2m                   \u001b[0m\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[01/05/24 13:54:18] </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO    </span> Loaded index of type <span style=\"font-weight: bold\">&lt;</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff; font-weight: bold\">class</span><span style=\"color: #000000; text-decoration-color: #000000\"> </span><span style=\"color: #008000; text-decoration-color: #008000\">'faiss.swigfaiss.IndexHNSWFlat'</span><span style=\"font-weight: bold\">&gt;</span> and   <a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">faiss_indexer.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#66\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">66</span></a>\n",
+       "<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">                    </span>         size <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1967771</span>                                                       <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">                   </span>\n",
+       "</pre>\n"
+      ],
+      "text/plain": [
+       "\u001b[2;36m[01/05/24 13:54:18]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO    \u001b[0m Loaded index of type \u001b[1m<\u001b[0m\u001b[1;95mclass\u001b[0m\u001b[39m \u001b[0m\u001b[32m'faiss.swigfaiss.IndexHNSWFlat'\u001b[0m\u001b[1m>\u001b[0m and   \u001b]8;id=776198;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\u001b\\\u001b[2mfaiss_indexer.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=995347;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#66\u001b\\\u001b[2m66\u001b[0m\u001b]8;;\u001b\\\n",
+       "\u001b[2;36m                    \u001b[0m         size \u001b[1;36m1967771\u001b[0m                                                       \u001b[2m                   \u001b[0m\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b477d6652c9e4326b9d9ab5f651d24a8",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Map:   0%|          | 0/2 [00:00<?, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "from xmen.linkers import default_ensemble\n",
+    "linker = default_ensemble(Path(conf.linker.candidate_generation.ngram.index_base_path).parent, cuda=False)\n",
+    "\n",
+    "candidates = linker.predict_batch(dataset)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "3728665d-03ae-47a6-82d6-3cf906b72c76",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from xmen import load_kb\n",
+    "\n",
+    "kb = load_kb(Path(conf.cache_dir) / 'snomed_german' / 'snomed_german.jsonl')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "c3a659c4-0371-4b6f-9544-8b1fc6615c61",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>mention</th>\n",
+       "      <th>class</th>\n",
+       "      <th>cui</th>\n",
+       "      <th>canonical name</th>\n",
+       "      <th>linked by</th>\n",
+       "      <th>score</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Cetuximab</td>\n",
+       "      <td>Clinical_Drug</td>\n",
+       "      <td>409401002</td>\n",
+       "      <td>Product containing cetuximab (medicinal product)</td>\n",
+       "      <td>[ngram, sapbert]</td>\n",
+       "      <td>1.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>monoklonaler Antikörper</td>\n",
+       "      <td>Clinical_Drug</td>\n",
+       "      <td>49616005</td>\n",
+       "      <td>Monoclonal antibody</td>\n",
+       "      <td>[ngram, sapbert]</td>\n",
+       "      <td>0.982318</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>epidermalen Wachstumsfaktorrezeptor</td>\n",
+       "      <td>Nutrient_or_Body_Substance</td>\n",
+       "      <td>86960007</td>\n",
+       "      <td>Epidermal growth factor-urogastrone receptor</td>\n",
+       "      <td>[sapbert]</td>\n",
+       "      <td>0.937485</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>EGFR</td>\n",
+       "      <td>Nutrient_or_Body_Substance</td>\n",
+       "      <td>86960007</td>\n",
+       "      <td>Epidermal growth factor-urogastrone receptor</td>\n",
+       "      <td>[ngram, sapbert]</td>\n",
+       "      <td>1.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Therapie des fortgeschrittenen kolorektalen Ka...</td>\n",
+       "      <td>Therapeutic</td>\n",
+       "      <td>1217692004</td>\n",
+       "      <td>Metastasis from malignant neoplasm of colon an...</td>\n",
+       "      <td>[ngram, sapbert]</td>\n",
+       "      <td>0.696480</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>fortgeschrittenen kolorektalen Karzinoms</td>\n",
+       "      <td>Diagnosis_or_Pathology</td>\n",
+       "      <td>1217692004</td>\n",
+       "      <td>Metastasis from malignant neoplasm of colon an...</td>\n",
+       "      <td>[ngram, sapbert]</td>\n",
+       "      <td>0.843411</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>Irinotecan</td>\n",
+       "      <td>Clinical_Drug</td>\n",
+       "      <td>372538008</td>\n",
+       "      <td>Irinotecan</td>\n",
+       "      <td>[ngram, sapbert]</td>\n",
+       "      <td>1.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>FOLFOX</td>\n",
+       "      <td>Therapeutic</td>\n",
+       "      <td>699297004</td>\n",
+       "      <td>Ohdo syndrome, Maat-Kievit-Brunner type</td>\n",
+       "      <td>[sapbert]</td>\n",
+       "      <td>0.812118</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>Versagen einer Behandlung</td>\n",
+       "      <td>Other_Finding</td>\n",
+       "      <td>7058009</td>\n",
+       "      <td>Noncompliance with treatment</td>\n",
+       "      <td>[ngram, sapbert]</td>\n",
+       "      <td>0.881669</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>Behandlung mit Oxaliplatin und Irinotecan</td>\n",
+       "      <td>Therapeutic</td>\n",
+       "      <td>447053005</td>\n",
+       "      <td>Oxaliplatin desensitization therapy</td>\n",
+       "      <td>[ngram, sapbert]</td>\n",
+       "      <td>0.673975</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>Oxaliplatin</td>\n",
+       "      <td>Clinical_Drug</td>\n",
+       "      <td>395814003</td>\n",
+       "      <td>Oxaliplatin</td>\n",
+       "      <td>[ngram, sapbert]</td>\n",
+       "      <td>1.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11</th>\n",
+       "      <td>Irinotecan</td>\n",
+       "      <td>Clinical_Drug</td>\n",
+       "      <td>372538008</td>\n",
+       "      <td>Irinotecan</td>\n",
+       "      <td>[ngram, sapbert]</td>\n",
+       "      <td>1.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>12</th>\n",
+       "      <td>HPV-Diagnostik</td>\n",
+       "      <td>Diagnostic</td>\n",
+       "      <td>700152009</td>\n",
+       "      <td>Human papilloma virus screening</td>\n",
+       "      <td>[sapbert]</td>\n",
+       "      <td>0.913146</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>13</th>\n",
+       "      <td>Plattenepithelkarzinom der Mundhöhle</td>\n",
+       "      <td>Diagnosis_or_Pathology</td>\n",
+       "      <td>307502000</td>\n",
+       "      <td>Squamous cell carcinoma of mouth</td>\n",
+       "      <td>[ngram, sapbert]</td>\n",
+       "      <td>0.987126</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                              mention  \\\n",
+       "0                                           Cetuximab   \n",
+       "1                             monoklonaler Antikörper   \n",
+       "2                 epidermalen Wachstumsfaktorrezeptor   \n",
+       "3                                                EGFR   \n",
+       "4   Therapie des fortgeschrittenen kolorektalen Ka...   \n",
+       "5            fortgeschrittenen kolorektalen Karzinoms   \n",
+       "6                                          Irinotecan   \n",
+       "7                                              FOLFOX   \n",
+       "8                           Versagen einer Behandlung   \n",
+       "9           Behandlung mit Oxaliplatin und Irinotecan   \n",
+       "10                                        Oxaliplatin   \n",
+       "11                                         Irinotecan   \n",
+       "12                                     HPV-Diagnostik   \n",
+       "13               Plattenepithelkarzinom der Mundhöhle   \n",
+       "\n",
+       "                         class         cui  \\\n",
+       "0                Clinical_Drug   409401002   \n",
+       "1                Clinical_Drug    49616005   \n",
+       "2   Nutrient_or_Body_Substance    86960007   \n",
+       "3   Nutrient_or_Body_Substance    86960007   \n",
+       "4                  Therapeutic  1217692004   \n",
+       "5       Diagnosis_or_Pathology  1217692004   \n",
+       "6                Clinical_Drug   372538008   \n",
+       "7                  Therapeutic   699297004   \n",
+       "8                Other_Finding     7058009   \n",
+       "9                  Therapeutic   447053005   \n",
+       "10               Clinical_Drug   395814003   \n",
+       "11               Clinical_Drug   372538008   \n",
+       "12                  Diagnostic   700152009   \n",
+       "13      Diagnosis_or_Pathology   307502000   \n",
+       "\n",
+       "                                       canonical name         linked by  \\\n",
+       "0    Product containing cetuximab (medicinal product)  [ngram, sapbert]   \n",
+       "1                                 Monoclonal antibody  [ngram, sapbert]   \n",
+       "2        Epidermal growth factor-urogastrone receptor         [sapbert]   \n",
+       "3        Epidermal growth factor-urogastrone receptor  [ngram, sapbert]   \n",
+       "4   Metastasis from malignant neoplasm of colon an...  [ngram, sapbert]   \n",
+       "5   Metastasis from malignant neoplasm of colon an...  [ngram, sapbert]   \n",
+       "6                                          Irinotecan  [ngram, sapbert]   \n",
+       "7             Ohdo syndrome, Maat-Kievit-Brunner type         [sapbert]   \n",
+       "8                        Noncompliance with treatment  [ngram, sapbert]   \n",
+       "9                 Oxaliplatin desensitization therapy  [ngram, sapbert]   \n",
+       "10                                        Oxaliplatin  [ngram, sapbert]   \n",
+       "11                                         Irinotecan  [ngram, sapbert]   \n",
+       "12                    Human papilloma virus screening         [sapbert]   \n",
+       "13                   Squamous cell carcinoma of mouth  [ngram, sapbert]   \n",
+       "\n",
+       "       score  \n",
+       "0   1.000000  \n",
+       "1   0.982318  \n",
+       "2   0.937485  \n",
+       "3   1.000000  \n",
+       "4   0.696480  \n",
+       "5   0.843411  \n",
+       "6   1.000000  \n",
+       "7   0.812118  \n",
+       "8   0.881669  \n",
+       "9   0.673975  \n",
+       "10  1.000000  \n",
+       "11  1.000000  \n",
+       "12  0.913146  \n",
+       "13  0.987126  "
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from util import get_dataframe\n",
+    "get_dataframe(candidates, kb)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5b8847b5-ff8f-40de-8a18-f2c2485daf7e",
+   "metadata": {},
+   "source": [
+    "## Re-Ranking"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "8185c65f-5645-4c38-a11a-a32a4f24a2a1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from xmen.reranking import CrossEncoderReranker"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "07d41590-f991-474a-9109-9b75d3184266",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Context length: 128\n",
+      "Use NIL values: True\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f48fee7d065841388efc407f3aeaeac5",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Map:   0%|          | 0/2 [00:00<?, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "fa32aa44068041bb97bd5ebfaaf77248",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/14 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "aff7ad16c32648c5afb548173e64dfa9",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/14 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "237c7aa368b6418fbf9d7b1bb34d9c8e",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/14 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "ce_candidates = CrossEncoderReranker.prepare_data(candidates, None, kb, k=64)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "6613f881-9323-42c7-a63c-710c6a1d758c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "rr = CrossEncoderReranker.load(\"phlobo/xmen-de-ce-medmentions\", device=0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "3af8dfbf-14b1-4a47-b76b-14868cde55e6",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b9a550a3efd34c76a001844587b21087",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/14 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "bd911340658d487087b452f3e87c1d68",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Map:   0%|          | 0/2 [00:00<?, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "5986edf5b65e4fd3a53ecb22d4728ace",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Map:   0%|          | 0/2 [00:00<?, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a7a6159291d24e42a63e6e0a647dc41f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Map:   0%|          | 0/2 [00:00<?, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "reranked = rr.rerank_batch(candidates, ce_candidates, k=64)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "0f95ab2d-5f3b-432c-9214-64316b3f1e99",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>mention</th>\n",
+       "      <th>class</th>\n",
+       "      <th>cui</th>\n",
+       "      <th>canonical name</th>\n",
+       "      <th>linked by</th>\n",
+       "      <th>score</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Cetuximab</td>\n",
+       "      <td>Clinical_Drug</td>\n",
+       "      <td>409400001</td>\n",
+       "      <td>Cetuximab</td>\n",
+       "      <td>[ngram, sapbert]</td>\n",
+       "      <td>0.040032</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>monoklonaler Antikörper</td>\n",
+       "      <td>Clinical_Drug</td>\n",
+       "      <td>49616005</td>\n",
+       "      <td>Monoclonal antibody</td>\n",
+       "      <td>[ngram, sapbert]</td>\n",
+       "      <td>0.043237</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>epidermalen Wachstumsfaktorrezeptor</td>\n",
+       "      <td>Nutrient_or_Body_Substance</td>\n",
+       "      <td>86960007</td>\n",
+       "      <td>Epidermal growth factor-urogastrone receptor</td>\n",
+       "      <td>[sapbert]</td>\n",
+       "      <td>0.022354</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>EGFR</td>\n",
+       "      <td>Nutrient_or_Body_Substance</td>\n",
+       "      <td>86960007</td>\n",
+       "      <td>Epidermal growth factor-urogastrone receptor</td>\n",
+       "      <td>[ngram, sapbert]</td>\n",
+       "      <td>0.033175</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Therapie des fortgeschrittenen kolorektalen Ka...</td>\n",
+       "      <td>Therapeutic</td>\n",
+       "      <td>1217692004</td>\n",
+       "      <td>Metastasis from malignant neoplasm of colon an...</td>\n",
+       "      <td>[ngram, sapbert]</td>\n",
+       "      <td>0.017036</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>fortgeschrittenen kolorektalen Karzinoms</td>\n",
+       "      <td>Diagnosis_or_Pathology</td>\n",
+       "      <td>126837005</td>\n",
+       "      <td>Tumor of large intestine</td>\n",
+       "      <td>[ngram, sapbert]</td>\n",
+       "      <td>0.017528</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>Irinotecan</td>\n",
+       "      <td>Clinical_Drug</td>\n",
+       "      <td>372538008</td>\n",
+       "      <td>Irinotecan</td>\n",
+       "      <td>[ngram, sapbert]</td>\n",
+       "      <td>0.042663</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>FOLFOX</td>\n",
+       "      <td>Therapeutic</td>\n",
+       "      <td>461391000124102</td>\n",
+       "      <td>Folfox protocol</td>\n",
+       "      <td>[ngram, sapbert]</td>\n",
+       "      <td>0.018860</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>Versagen einer Behandlung</td>\n",
+       "      <td>Other_Finding</td>\n",
+       "      <td>266721009</td>\n",
+       "      <td>Absent response to treatment</td>\n",
+       "      <td>[sapbert]</td>\n",
+       "      <td>0.017751</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>Behandlung mit Oxaliplatin und Irinotecan</td>\n",
+       "      <td>Therapeutic</td>\n",
+       "      <td>447053005</td>\n",
+       "      <td>Oxaliplatin desensitization therapy</td>\n",
+       "      <td>[ngram, sapbert]</td>\n",
+       "      <td>0.017154</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>Oxaliplatin</td>\n",
+       "      <td>Clinical_Drug</td>\n",
+       "      <td>395814003</td>\n",
+       "      <td>Oxaliplatin</td>\n",
+       "      <td>[ngram, sapbert]</td>\n",
+       "      <td>0.039662</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11</th>\n",
+       "      <td>Irinotecan</td>\n",
+       "      <td>Clinical_Drug</td>\n",
+       "      <td>372538008</td>\n",
+       "      <td>Irinotecan</td>\n",
+       "      <td>[ngram, sapbert]</td>\n",
+       "      <td>0.045797</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>12</th>\n",
+       "      <td>HPV-Diagnostik</td>\n",
+       "      <td>Diagnostic</td>\n",
+       "      <td>700152009</td>\n",
+       "      <td>Human papilloma virus screening</td>\n",
+       "      <td>[sapbert]</td>\n",
+       "      <td>0.019670</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>13</th>\n",
+       "      <td>Plattenepithelkarzinom der Mundhöhle</td>\n",
+       "      <td>Diagnosis_or_Pathology</td>\n",
+       "      <td>307502000</td>\n",
+       "      <td>Squamous cell carcinoma of mouth</td>\n",
+       "      <td>[ngram, sapbert]</td>\n",
+       "      <td>0.037824</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                              mention  \\\n",
+       "0                                           Cetuximab   \n",
+       "1                             monoklonaler Antikörper   \n",
+       "2                 epidermalen Wachstumsfaktorrezeptor   \n",
+       "3                                                EGFR   \n",
+       "4   Therapie des fortgeschrittenen kolorektalen Ka...   \n",
+       "5            fortgeschrittenen kolorektalen Karzinoms   \n",
+       "6                                          Irinotecan   \n",
+       "7                                              FOLFOX   \n",
+       "8                           Versagen einer Behandlung   \n",
+       "9           Behandlung mit Oxaliplatin und Irinotecan   \n",
+       "10                                        Oxaliplatin   \n",
+       "11                                         Irinotecan   \n",
+       "12                                     HPV-Diagnostik   \n",
+       "13               Plattenepithelkarzinom der Mundhöhle   \n",
+       "\n",
+       "                         class              cui  \\\n",
+       "0                Clinical_Drug        409400001   \n",
+       "1                Clinical_Drug         49616005   \n",
+       "2   Nutrient_or_Body_Substance         86960007   \n",
+       "3   Nutrient_or_Body_Substance         86960007   \n",
+       "4                  Therapeutic       1217692004   \n",
+       "5       Diagnosis_or_Pathology        126837005   \n",
+       "6                Clinical_Drug        372538008   \n",
+       "7                  Therapeutic  461391000124102   \n",
+       "8                Other_Finding        266721009   \n",
+       "9                  Therapeutic        447053005   \n",
+       "10               Clinical_Drug        395814003   \n",
+       "11               Clinical_Drug        372538008   \n",
+       "12                  Diagnostic        700152009   \n",
+       "13      Diagnosis_or_Pathology        307502000   \n",
+       "\n",
+       "                                       canonical name         linked by  \\\n",
+       "0                                           Cetuximab  [ngram, sapbert]   \n",
+       "1                                 Monoclonal antibody  [ngram, sapbert]   \n",
+       "2        Epidermal growth factor-urogastrone receptor         [sapbert]   \n",
+       "3        Epidermal growth factor-urogastrone receptor  [ngram, sapbert]   \n",
+       "4   Metastasis from malignant neoplasm of colon an...  [ngram, sapbert]   \n",
+       "5                            Tumor of large intestine  [ngram, sapbert]   \n",
+       "6                                          Irinotecan  [ngram, sapbert]   \n",
+       "7                                     Folfox protocol  [ngram, sapbert]   \n",
+       "8                        Absent response to treatment         [sapbert]   \n",
+       "9                 Oxaliplatin desensitization therapy  [ngram, sapbert]   \n",
+       "10                                        Oxaliplatin  [ngram, sapbert]   \n",
+       "11                                         Irinotecan  [ngram, sapbert]   \n",
+       "12                    Human papilloma virus screening         [sapbert]   \n",
+       "13                   Squamous cell carcinoma of mouth  [ngram, sapbert]   \n",
+       "\n",
+       "       score  \n",
+       "0   0.040032  \n",
+       "1   0.043237  \n",
+       "2   0.022354  \n",
+       "3   0.033175  \n",
+       "4   0.017036  \n",
+       "5   0.017528  \n",
+       "6   0.042663  \n",
+       "7   0.018860  \n",
+       "8   0.017751  \n",
+       "9   0.017154  \n",
+       "10  0.039662  \n",
+       "11  0.045797  \n",
+       "12  0.019670  \n",
+       "13  0.037824  "
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# After Re-ranking\n",
+    "get_dataframe(reranked, kb)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9ace9bf0-c0ef-4c9e-9bd6-6ecce3cf03b6",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "070c2a21-fe32-4b59-a8ac-1b97d9548e76",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python [conda env:xmen_notebooks]",
+   "language": "python",
+   "name": "conda-env-xmen_notebooks-py"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/README.md b/examples/README.md
new file mode 100644
index 0000000..d8f8a05
--- /dev/null
+++ b/examples/README.md
@@ -0,0 +1,13 @@
+# Examples
+
+|Link|Description|
+|---|---|
+|[01_BRONCO_German.ipynb](01_BRONCO_German.ipynb)|Candidate generation and supervised re-ranking using the BRONCO corpus.<br>Shows how you can configure multiple dictionaries in the same config file.|
+|[02_spaCy_German.ipynb](02_spaCy_German.ipynb)|Using a spaCy NER model with xMEN.<br>Shows how to build a pipeline without labelled data using candidate generation, type filtering and pre-trained re-rankers.|
+|[03_SNOMED_Linking_German.ipynb](03_SNOMED_Linking_German.ipynb)|Linking against codes in UMLS source vocabularies (here SNOMED CT)|
+| | |
+
+
+## Benchmarks
+
+More examples for configurations can be found in the [Benchmarks](../benchmarks) folder.
\ No newline at end of file
diff --git a/examples/Temp.ipynb b/examples/Temp.ipynb
deleted file mode 100644
index 40b541c..0000000
--- a/examples/Temp.ipynb
+++ /dev/null
@@ -1,216 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from xmen import evaluation\n",
-    "from xmen.data import make_document, Entity, Concept"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "gt = [\n",
-    "    make_document(\n",
-    "        [\n",
-    "            Entity([[11, 17]], \"entity\", concepts=[\n",
-    "                Concept(\"c1\", db_name=\"UMLS\"),\n",
-    "            ]),\n",
-    "        ]\n",
-    "    )\n",
-    "]\n",
-    "pred = [\n",
-    "    make_document(\n",
-    "        [\n",
-    "            Entity([[11, 17]], \"entity\", concepts=[\n",
-    "                Concept(\"c1\", db_name=\"UMLS\")\n",
-    "            ]),\n",
-    "            Entity([[11, 15]], \"entity\", concepts=[\n",
-    "                Concept(\"c1\", db_name=\"UMLS\")\n",
-    "            ]),\n",
-    "            Entity([[16, 17]], \"entity\", concepts=[\n",
-    "                Concept(\"c2\", db_name=\"UMLS\")\n",
-    "            ]),\n",
-    "        ]\n",
-    "    )\n",
-    "]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 37,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>_word_len</th>\n",
-       "      <th>_abbrev</th>\n",
-       "      <th>gt_start</th>\n",
-       "      <th>gt_end</th>\n",
-       "      <th>gt_text</th>\n",
-       "      <th>pred_start</th>\n",
-       "      <th>pred_end</th>\n",
-       "      <th>pred_text</th>\n",
-       "      <th>ner_match_type</th>\n",
-       "      <th>gold_concept</th>\n",
-       "      <th>gold_type</th>\n",
-       "      <th>pred_index</th>\n",
-       "      <th>pred_index_score</th>\n",
-       "      <th>pred_top</th>\n",
-       "      <th>pred_top_score</th>\n",
-       "      <th>corpus_id</th>\n",
-       "      <th>document_id</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>1.0</td>\n",
-       "      <td>False</td>\n",
-       "      <td>11.0</td>\n",
-       "      <td>17.0</td>\n",
-       "      <td>[entity]</td>\n",
-       "      <td>11</td>\n",
-       "      <td>15</td>\n",
-       "      <td>[entity]</td>\n",
-       "      <td>be</td>\n",
-       "      <td>{'db_id': 'c1', 'target_kb': 'UMLS', 'type': N...</td>\n",
-       "      <td>None</td>\n",
-       "      <td>0</td>\n",
-       "      <td>None</td>\n",
-       "      <td>c1</td>\n",
-       "      <td>None</td>\n",
-       "      <td>x</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>1.0</td>\n",
-       "      <td>False</td>\n",
-       "      <td>11.0</td>\n",
-       "      <td>17.0</td>\n",
-       "      <td>[entity]</td>\n",
-       "      <td>11</td>\n",
-       "      <td>17</td>\n",
-       "      <td>[entity]</td>\n",
-       "      <td>be</td>\n",
-       "      <td>{'db_id': 'c1', 'target_kb': 'UMLS', 'type': N...</td>\n",
-       "      <td>None</td>\n",
-       "      <td>0</td>\n",
-       "      <td>None</td>\n",
-       "      <td>c1</td>\n",
-       "      <td>None</td>\n",
-       "      <td>x</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>NaN</td>\n",
-       "      <td>None</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>None</td>\n",
-       "      <td>16</td>\n",
-       "      <td>17</td>\n",
-       "      <td>[entity]</td>\n",
-       "      <td>fp</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>-1</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>x</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "   _word_len _abbrev  gt_start  gt_end   gt_text  pred_start  pred_end  \\\n",
-       "0        1.0   False      11.0    17.0  [entity]          11        15   \n",
-       "1        1.0   False      11.0    17.0  [entity]          11        17   \n",
-       "2        NaN    None       NaN     NaN      None          16        17   \n",
-       "\n",
-       "  pred_text ner_match_type                                       gold_concept  \\\n",
-       "0  [entity]             be  {'db_id': 'c1', 'target_kb': 'UMLS', 'type': N...   \n",
-       "1  [entity]             be  {'db_id': 'c1', 'target_kb': 'UMLS', 'type': N...   \n",
-       "2  [entity]             fp                                               None   \n",
-       "\n",
-       "  gold_type  pred_index pred_index_score pred_top pred_top_score corpus_id  \\\n",
-       "0      None           0             None       c1           None         x   \n",
-       "1      None           0             None       c1           None         x   \n",
-       "2      None          -1             None     None           None         x   \n",
-       "\n",
-       "  document_id  \n",
-       "0           1  \n",
-       "1           1  \n",
-       "2           1  "
-      ]
-     },
-     "execution_count": 37,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "from xmen.evaluation import error_analysis\n",
-    "error_analysis(gt, pred)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "xmen_notebooks",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.11"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/examples/conf/meddra_german.yaml b/examples/conf/meddra_german.yaml
index c20292a..552246f 100644
--- a/examples/conf/meddra_german.yaml
+++ b/examples/conf/meddra_german.yaml
@@ -3,6 +3,12 @@ name: meddra_german
 dict:
   custom:    
     umls_meta_path: ${oc.env:UMLS_HOME}/2023AA/META
+    id_key: SDUI
     sabs:
         - MDR
-        - MDRGER
\ No newline at end of file
+        - MDRGER
+    # Un-comment to add aliases from other UMLS vocabularies and languages
+    #umls_extend:
+    #  lang: 
+    #    - de
+    #    - en
\ No newline at end of file
diff --git a/examples/conf/snomed_german.yaml b/examples/conf/snomed_german.yaml
new file mode 100644
index 0000000..e31d025
--- /dev/null
+++ b/examples/conf/snomed_german.yaml
@@ -0,0 +1,24 @@
+name: snomed_german
+
+cache_dir: ${oc.env:HOME}/.cache/xmen/
+
+dict:
+  custom:    
+    umls_meta_path: ${oc.env:UMLS_HOME}/2023AA/META
+    id_key: SCUI
+    sabs:
+        - SNOMEDCT_US
+    umls_extend:
+        lang: 
+          - de
+          - en
+
+linker:
+    candidate_generation:
+        k: 64
+        ngram:
+          index_base_path: ${cache_dir}/${name}/index/ngrams/
+          k: 3
+        sapbert:
+          index_base_path: ${cache_dir}/${name}/index/sapbert
+          k: 3
\ No newline at end of file
diff --git a/examples/dicts/umls_source.py b/examples/dicts/umls_source.py
index 3755741..a32222d 100644
--- a/examples/dicts/umls_source.py
+++ b/examples/dicts/umls_source.py
@@ -1,25 +1,54 @@
 from tqdm.auto import tqdm
-from xmen.umls import read_umls_file_headers
-
+from xmen.umls import read_umls_file_headers, get_umls_concepts
+from collections import defaultdict
+from xmen.log import logger
 
 def get_concept_details(cfg):
+    """Collect concepts of the configured UMLS source vocabularies from MRCONSO.RRF.
+
+    Concepts are keyed by `id_key` (default: SDUI, else SCUI); atoms without such an ID are
+    skipped. With `umls_extend` set, types and aliases of linked UMLS CUIs are merged in.
+    """
     mrconso = "MRCONSO.RRF"
     concept_details = {}
     meta_path = cfg.dict.custom.umls_meta_path
     sabs = cfg.dict.custom.sabs
+    id_key = cfg.dict.custom.get("id_key")
 
     headers = read_umls_file_headers(meta_path, mrconso)
 
+    scui2cui = defaultdict(list)  # source vocabulary ID -> CUIs of its atoms
+
     with open(f"{meta_path}/{mrconso}") as fin:
         for line in tqdm(fin.readlines()):
             splits = line.strip().split("|")
             assert len(headers) == len(splits)
             concept = dict(zip(headers, splits))
             if concept["SAB"] in sabs:
-                sid = concept["SDUI"]
+                cui = concept["CUI"]
+                sid = concept[id_key] if id_key else (concept["SDUI"] or concept["SCUI"])
+                if not sid:
+                    logger.warning(f"Skipping concept with CUI {cui} because we could not find a valid source vocabulary ID")
+                    continue  # never register an entry under an empty ID
                 name = concept["STR"]
                 if sid in concept_details:
                     concept_details[sid]["aliases"].append(name)
                 else:
                     concept_details[sid] = {"concept_id": sid, "canonical_name": name, "types": [], "aliases": []}
+                scui2cui[sid].append(cui)
+
+    if umls_extend := cfg.dict.custom.get('umls_extend'):
+        # Optionally extend with UMLS synonyms
+        other_umls_concepts = get_umls_concepts(meta_path,
+            umls_extend.get("lang"), sabs=umls_extend.get("sabs"), sources=umls_extend.get("sources"), semantic_groups=umls_extend.get("semantic_groups"), semantic_types=umls_extend.get("semantic_types"))
+
+        for scui, concept in tqdm(concept_details.items()):
+            for cui in scui2cui[scui]:
+                if cui in other_umls_concepts:
+                    for t in other_umls_concepts[cui]['types']:
+                        if t not in concept["types"]:
+                            concept["types"].append(t)
+                    for new_alias in other_umls_concepts[cui]["aliases"] + [other_umls_concepts[cui]["canonical_name"]]:
+                        if new_alias not in concept["aliases"] and new_alias != concept["canonical_name"]:
+                            concept["aliases"].append(new_alias)
     return concept_details
diff --git a/examples/ggponc2tui.csv b/examples/ggponc2tui.csv
deleted file mode 100644
index 2264706..0000000
--- a/examples/ggponc2tui.csv
+++ /dev/null
@@ -1,128 +0,0 @@
-Group,Group Long,TUI,Type Name,Diagnosis_or_Pathology,Other_Finding,Clinical_Drug,Nutrient_or_Body_Substance,External_Substance,Therapeutic,Diagnostic
-ACTI,Activities & Behaviors,T052,Activity,x,x,,,,x,x
-ACTI,Activities & Behaviors,T053,Behavior,x,x,,,,,
-ACTI,Activities & Behaviors,T056,Daily or Recreational Activity,x,x,,,,,
-ACTI,Activities & Behaviors,T051,Event,x,x,,,,,
-ACTI,Activities & Behaviors,T064,Governmental or Regulatory Activity,x,x,,,,x,x
-ACTI,Activities & Behaviors,T055,Individual Behavior,x,x,,,,,
-ACTI,Activities & Behaviors,T066,Machine Activity,x,x,,,,x,x
-ACTI,Activities & Behaviors,T057,Occupational Activity,x,x,,,,x,x
-ACTI,Activities & Behaviors,T054,Social Behavior,x,x,,,,,
-ANAT,Anatomy,T017,Anatomical Structure,x,x,,x,,,
-ANAT,Anatomy,T029,Body Location or Region,x,x,,x,,,
-ANAT,Anatomy,T023,"Body Part, Organ, or Organ Component",x,x,,x,,,
-ANAT,Anatomy,T030,Body Space or Junction,x,x,,x,,,
-ANAT,Anatomy,T031,Body Substance,x,x,,x,,,
-ANAT,Anatomy,T022,Body System,x,x,,x,,,
-ANAT,Anatomy,T025,Cell,x,x,,x,,,
-ANAT,Anatomy,T026,Cell Component,x,x,,x,,,
-ANAT,Anatomy,T018,Embryonic Structure,x,x,,x,,,
-ANAT,Anatomy,T021,Fully Formed Anatomical Structure,x,x,,x,,,
-ANAT,Anatomy,T024,Tissue,x,x,,x,,,
-CHEM,Chemicals & Drugs,T116,"Amino Acid, Peptide, or Protein",,,x,x,x,x,x
-CHEM,Chemicals & Drugs,T195,Antibiotic,,,x,x,x,x,x
-CHEM,Chemicals & Drugs,T123,Biologically Active Substance,x,x,x,x,x,x,x
-CHEM,Chemicals & Drugs,T122,Biomedical or Dental Material,x,x,x,x,x,x,x
-CHEM,Chemicals & Drugs,T103,Chemical,x,x,x,x,x,x,x
-CHEM,Chemicals & Drugs,T120,Chemical Viewed Functionally,x,x,x,x,x,x,x
-CHEM,Chemicals & Drugs,T104,Chemical Viewed Structurally,x,x,x,x,x,x,x
-CHEM,Chemicals & Drugs,T200,Clinical Drug,x,x,x,x,x,x,x
-CHEM,Chemicals & Drugs,T196,"Element, Ion, or Isotope",x,x,x,x,x,x,x
-CHEM,Chemicals & Drugs,T126,Enzyme,x,x,x,x,x,x,x
-CHEM,Chemicals & Drugs,T131,Hazardous or Poisonous Substance,x,x,x,x,x,x,x
-CHEM,Chemicals & Drugs,T125,Hormone,x,x,x,x,x,x,x
-CHEM,Chemicals & Drugs,T129,Immunologic Factor,x,x,x,x,x,x,x
-CHEM,Chemicals & Drugs,T130,"Indicator, Reagent, or Diagnostic Aid",x,x,x,x,x,x,x
-CHEM,Chemicals & Drugs,T197,Inorganic Chemical,x,x,x,x,x,x,x
-CHEM,Chemicals & Drugs,T114,"Nucleic Acid, Nucleoside, or Nucleotide",x,x,x,x,x,x,x
-CHEM,Chemicals & Drugs,T109,Organic Chemical,x,x,x,x,x,x,x
-CHEM,Chemicals & Drugs,T121,Pharmacologic Substance,x,x,x,x,x,x,x
-CHEM,Chemicals & Drugs,T192,Receptor,x,x,x,x,x,x,x
-CHEM,Chemicals & Drugs,T127,Vitamin,x,x,x,x,x,x,x
-CONC,Concepts & Ideas,T185,Classification,,x,,,,,
-CONC,Concepts & Ideas,T077,Conceptual Entity,,x,,,,,
-CONC,Concepts & Ideas,T169,Functional Concept,,x,,,,,
-CONC,Concepts & Ideas,T102,Group Attribute,,x,,,,,
-CONC,Concepts & Ideas,T078,Idea or Concept,,x,,,,,
-CONC,Concepts & Ideas,T170,Intellectual Product,,,,,,,
-CONC,Concepts & Ideas,T171,Language,,x,,,,,
-CONC,Concepts & Ideas,T080,Qualitative Concept,,x,,,,,
-CONC,Concepts & Ideas,T081,Quantitative Concept,,x,,,,,
-CONC,Concepts & Ideas,T089,Regulation or Law,,,,,,,
-CONC,Concepts & Ideas,T082,Spatial Concept,,x,,,,,
-CONC,Concepts & Ideas,T079,Temporal Concept,,x,,,,,
-DEVI,Devices,T203,Drug Delivery Device,,,x,,,x,x
-DEVI,Devices,T074,Medical Device,,,x,,,x,x
-DEVI,Devices,T075,Research Device,,,x,,,x,x
-DISO,Disorders,T020,Acquired Abnormality,x,x,,,,,
-DISO,Disorders,T190,Anatomical Abnormality,x,x,,,,,
-DISO,Disorders,T049,Cell or Molecular Dysfunction,x,x,,,,,
-DISO,Disorders,T019,Congenital Abnormality,x,x,,,,,
-DISO,Disorders,T047,Disease or Syndrome,x,x,,,,,
-DISO,Disorders,T050,Experimental Model of Disease,x,x,,,,,
-DISO,Disorders,T033,Finding,x,x,,,,,
-DISO,Disorders,T037,Injury or Poisoning,x,x,,,x,,
-DISO,Disorders,T048,Mental or Behavioral Dysfunction,x,x,,,,,
-DISO,Disorders,T191,Neoplastic Process,x,x,,,,,
-DISO,Disorders,T046,Pathologic Function,x,x,,,,,
-DISO,Disorders,T184,Sign or Symptom,x,x,,,,,
-GENE,Genes & Molecular Sequences,T087,Amino Acid Sequence,x,x,x,x,x,,
-GENE,Genes & Molecular Sequences,T088,Carbohydrate Sequence,x,x,x,x,x,,
-GENE,Genes & Molecular Sequences,T028,Gene or Genome,x,x,x,x,x,,
-GENE,Genes & Molecular Sequences,T085,Molecular Sequence,x,x,x,x,x,,
-GENE,Genes & Molecular Sequences,T086,Nucleotide Sequence,x,x,x,x,x,,
-GEOG,Geographic Areas,T083,Geographic Area,x,x,x,x,x,,
-LIVB,Living Beings,T100,Age Group,,x,,,,,
-LIVB,Living Beings,T011,Amphibian,,x,,,,,
-LIVB,Living Beings,T008,Animal,,x,,,,,
-LIVB,Living Beings,T194,Archaeon,,x,,,,,
-LIVB,Living Beings,T007,Bacterium,x,x,,x,,,
-LIVB,Living Beings,T012,Bird,,x,,,,,
-LIVB,Living Beings,T204,Eukaryote,x,x,,x,,,
-LIVB,Living Beings,T099,Family Group,,x,,,,,
-LIVB,Living Beings,T013,Fish,,x,,,,,
-LIVB,Living Beings,T004,Fungus,x,x,,x,,,
-LIVB,Living Beings,T096,Group,,x,,,,,
-LIVB,Living Beings,T016,Human,,x,,,,,
-LIVB,Living Beings,T015,Mammal,,x,,,,,
-LIVB,Living Beings,T001,Organism,,x,,,,,
-LIVB,Living Beings,T101,Patient or Disabled Group,x,x,,,,,
-LIVB,Living Beings,T002,Plant,,x,,,,,
-LIVB,Living Beings,T098,Population Group,,x,,,,,
-LIVB,Living Beings,T097,Professional or Occupational Group,,x,,,,x,x
-LIVB,Living Beings,T014,Reptile,,x,,,,,
-LIVB,Living Beings,T010,Vertebrate,,x,,,,,
-LIVB,Living Beings,T005,Virus,x,x,,,,,
-OBJC,Objects,T071,Entity,x,x,x,x,x,x,x
-OBJC,Objects,T168,Food,x,x,x,x,x,x,x
-OBJC,Objects,T073,Manufactured Object,x,x,x,x,x,x,x
-OBJC,Objects,T072,Physical Object,x,x,x,x,x,x,x
-OBJC,Objects,T167,Substance,x,x,x,x,x,x,x
-OCCU,Occupations,T091,Biomedical Occupation or Discipline,x,x,,,,x,x
-OCCU,Occupations,T090,Occupation or Discipline,x,x,,,,x,x
-ORGA,Organizations,T093,Health Care Related Organization,,x,,,,x,x
-ORGA,Organizations,T092,Organization,,x,,,,x,x
-ORGA,Organizations,T094,Professional Society,,x,,,,x,x
-ORGA,Organizations,T095,Self-help or Relief Organization,,x,,,,x,x
-PHEN,Phenomena,T038,Biologic Function,x,x,,,,x,x
-PHEN,Phenomena,T069,Environmental Effect of Humans,x,x,,,,x,x
-PHEN,Phenomena,T068,Human-caused Phenomenon or Process,x,x,,,,x,x
-PHEN,Phenomena,T034,Laboratory or Test Result,x,x,,,,x,x
-PHEN,Phenomena,T070,Natural Phenomenon or Process,x,x,,,,x,x
-PHEN,Phenomena,T067,Phenomenon or Process,x,x,,,,x,x
-PHYS,Physiology,T043,Cell Function,x,x,,x,,,
-PHYS,Physiology,T201,Clinical Attribute,x,x,,,,,
-PHYS,Physiology,T045,Genetic Function,x,x,,x,,,
-PHYS,Physiology,T041,Mental Process,x,x,,,,,
-PHYS,Physiology,T044,Molecular Function,x,x,,x,,,
-PHYS,Physiology,T032,Organism Attribute,x,x,,,,,
-PHYS,Physiology,T040,Organism Function,x,x,,,,,
-PHYS,Physiology,T042,Organ or Tissue Function,x,x,,,,,
-PHYS,Physiology,T039,Physiologic Function,x,x,,,,,
-PROC,Procedures,T060,Diagnostic Procedure,,,,,,x,x
-PROC,Procedures,T065,Educational Activity,,,,,,x,
-PROC,Procedures,T058,Health Care Activity,,,,,,x,x
-PROC,Procedures,T059,Laboratory Procedure,,,,,,x,x
-PROC,Procedures,T063,Molecular Biology Research Technique,,,,,,x,x
-PROC,Procedures,T062,Research Activity,,,,,,x,x
-PROC,Procedures,T061,Therapeutic or Preventive Procedure,,,,,,x,x
\ No newline at end of file
diff --git a/examples/ggponc_tuis.csv b/examples/ggponc_tuis.csv
new file mode 100644
index 0000000..2ee4ab5
--- /dev/null
+++ b/examples/ggponc_tuis.csv
@@ -0,0 +1,414 @@
+class,tui
+Diagnosis_or_Pathology,T052
+Diagnosis_or_Pathology,T053
+Diagnosis_or_Pathology,T056
+Diagnosis_or_Pathology,T051
+Diagnosis_or_Pathology,T064
+Diagnosis_or_Pathology,T055
+Diagnosis_or_Pathology,T066
+Diagnosis_or_Pathology,T057
+Diagnosis_or_Pathology,T054
+Diagnosis_or_Pathology,T017
+Diagnosis_or_Pathology,T029
+Diagnosis_or_Pathology,T023
+Diagnosis_or_Pathology,T030
+Diagnosis_or_Pathology,T031
+Diagnosis_or_Pathology,T022
+Diagnosis_or_Pathology,T025
+Diagnosis_or_Pathology,T026
+Diagnosis_or_Pathology,T018
+Diagnosis_or_Pathology,T021
+Diagnosis_or_Pathology,T024
+Diagnosis_or_Pathology,T123
+Diagnosis_or_Pathology,T122
+Diagnosis_or_Pathology,T103
+Diagnosis_or_Pathology,T120
+Diagnosis_or_Pathology,T104
+Diagnosis_or_Pathology,T200
+Diagnosis_or_Pathology,T196
+Diagnosis_or_Pathology,T126
+Diagnosis_or_Pathology,T131
+Diagnosis_or_Pathology,T125
+Diagnosis_or_Pathology,T129
+Diagnosis_or_Pathology,T130
+Diagnosis_or_Pathology,T197
+Diagnosis_or_Pathology,T114
+Diagnosis_or_Pathology,T109
+Diagnosis_or_Pathology,T121
+Diagnosis_or_Pathology,T192
+Diagnosis_or_Pathology,T127
+Diagnosis_or_Pathology,T020
+Diagnosis_or_Pathology,T190
+Diagnosis_or_Pathology,T049
+Diagnosis_or_Pathology,T019
+Diagnosis_or_Pathology,T047
+Diagnosis_or_Pathology,T050
+Diagnosis_or_Pathology,T033
+Diagnosis_or_Pathology,T037
+Diagnosis_or_Pathology,T048
+Diagnosis_or_Pathology,T191
+Diagnosis_or_Pathology,T046
+Diagnosis_or_Pathology,T184
+Diagnosis_or_Pathology,T087
+Diagnosis_or_Pathology,T088
+Diagnosis_or_Pathology,T028
+Diagnosis_or_Pathology,T085
+Diagnosis_or_Pathology,T086
+Diagnosis_or_Pathology,T083
+Diagnosis_or_Pathology,T007
+Diagnosis_or_Pathology,T204
+Diagnosis_or_Pathology,T004
+Diagnosis_or_Pathology,T101
+Diagnosis_or_Pathology,T005
+Diagnosis_or_Pathology,T071
+Diagnosis_or_Pathology,T168
+Diagnosis_or_Pathology,T073
+Diagnosis_or_Pathology,T072
+Diagnosis_or_Pathology,T167
+Diagnosis_or_Pathology,T091
+Diagnosis_or_Pathology,T090
+Diagnosis_or_Pathology,T038
+Diagnosis_or_Pathology,T069
+Diagnosis_or_Pathology,T068
+Diagnosis_or_Pathology,T034
+Diagnosis_or_Pathology,T070
+Diagnosis_or_Pathology,T067
+Diagnosis_or_Pathology,T043
+Diagnosis_or_Pathology,T201
+Diagnosis_or_Pathology,T045
+Diagnosis_or_Pathology,T041
+Diagnosis_or_Pathology,T044
+Diagnosis_or_Pathology,T032
+Diagnosis_or_Pathology,T040
+Diagnosis_or_Pathology,T042
+Diagnosis_or_Pathology,T039
+Other_Finding,T052
+Other_Finding,T053
+Other_Finding,T056
+Other_Finding,T051
+Other_Finding,T064
+Other_Finding,T055
+Other_Finding,T066
+Other_Finding,T057
+Other_Finding,T054
+Other_Finding,T017
+Other_Finding,T029
+Other_Finding,T023
+Other_Finding,T030
+Other_Finding,T031
+Other_Finding,T022
+Other_Finding,T025
+Other_Finding,T026
+Other_Finding,T018
+Other_Finding,T021
+Other_Finding,T024
+Other_Finding,T123
+Other_Finding,T122
+Other_Finding,T103
+Other_Finding,T120
+Other_Finding,T104
+Other_Finding,T200
+Other_Finding,T196
+Other_Finding,T126
+Other_Finding,T131
+Other_Finding,T125
+Other_Finding,T129
+Other_Finding,T130
+Other_Finding,T197
+Other_Finding,T114
+Other_Finding,T109
+Other_Finding,T121
+Other_Finding,T192
+Other_Finding,T127
+Other_Finding,T185
+Other_Finding,T077
+Other_Finding,T169
+Other_Finding,T102
+Other_Finding,T078
+Other_Finding,T171
+Other_Finding,T080
+Other_Finding,T081
+Other_Finding,T082
+Other_Finding,T079
+Other_Finding,T020
+Other_Finding,T190
+Other_Finding,T049
+Other_Finding,T019
+Other_Finding,T047
+Other_Finding,T050
+Other_Finding,T033
+Other_Finding,T037
+Other_Finding,T048
+Other_Finding,T191
+Other_Finding,T046
+Other_Finding,T184
+Other_Finding,T087
+Other_Finding,T088
+Other_Finding,T028
+Other_Finding,T085
+Other_Finding,T086
+Other_Finding,T083
+Other_Finding,T100
+Other_Finding,T011
+Other_Finding,T008
+Other_Finding,T194
+Other_Finding,T007
+Other_Finding,T012
+Other_Finding,T204
+Other_Finding,T099
+Other_Finding,T013
+Other_Finding,T004
+Other_Finding,T096
+Other_Finding,T016
+Other_Finding,T015
+Other_Finding,T001
+Other_Finding,T101
+Other_Finding,T002
+Other_Finding,T098
+Other_Finding,T097
+Other_Finding,T014
+Other_Finding,T010
+Other_Finding,T005
+Other_Finding,T071
+Other_Finding,T168
+Other_Finding,T073
+Other_Finding,T072
+Other_Finding,T167
+Other_Finding,T091
+Other_Finding,T090
+Other_Finding,T093
+Other_Finding,T092
+Other_Finding,T094
+Other_Finding,T095
+Other_Finding,T038
+Other_Finding,T069
+Other_Finding,T068
+Other_Finding,T034
+Other_Finding,T070
+Other_Finding,T067
+Other_Finding,T043
+Other_Finding,T201
+Other_Finding,T045
+Other_Finding,T041
+Other_Finding,T044
+Other_Finding,T032
+Other_Finding,T040
+Other_Finding,T042
+Other_Finding,T039
+Clinical_Drug,T116
+Clinical_Drug,T195
+Clinical_Drug,T123
+Clinical_Drug,T122
+Clinical_Drug,T103
+Clinical_Drug,T120
+Clinical_Drug,T104
+Clinical_Drug,T200
+Clinical_Drug,T196
+Clinical_Drug,T126
+Clinical_Drug,T131
+Clinical_Drug,T125
+Clinical_Drug,T129
+Clinical_Drug,T130
+Clinical_Drug,T197
+Clinical_Drug,T114
+Clinical_Drug,T109
+Clinical_Drug,T121
+Clinical_Drug,T192
+Clinical_Drug,T127
+Clinical_Drug,T203
+Clinical_Drug,T074
+Clinical_Drug,T075
+Clinical_Drug,T087
+Clinical_Drug,T088
+Clinical_Drug,T028
+Clinical_Drug,T085
+Clinical_Drug,T086
+Clinical_Drug,T083
+Clinical_Drug,T071
+Clinical_Drug,T168
+Clinical_Drug,T073
+Clinical_Drug,T072
+Clinical_Drug,T167
+Nutrient_or_Body_Substance,T017
+Nutrient_or_Body_Substance,T029
+Nutrient_or_Body_Substance,T023
+Nutrient_or_Body_Substance,T030
+Nutrient_or_Body_Substance,T031
+Nutrient_or_Body_Substance,T022
+Nutrient_or_Body_Substance,T025
+Nutrient_or_Body_Substance,T026
+Nutrient_or_Body_Substance,T018
+Nutrient_or_Body_Substance,T021
+Nutrient_or_Body_Substance,T024
+Nutrient_or_Body_Substance,T116
+Nutrient_or_Body_Substance,T195
+Nutrient_or_Body_Substance,T123
+Nutrient_or_Body_Substance,T122
+Nutrient_or_Body_Substance,T103
+Nutrient_or_Body_Substance,T120
+Nutrient_or_Body_Substance,T104
+Nutrient_or_Body_Substance,T200
+Nutrient_or_Body_Substance,T196
+Nutrient_or_Body_Substance,T126
+Nutrient_or_Body_Substance,T131
+Nutrient_or_Body_Substance,T125
+Nutrient_or_Body_Substance,T129
+Nutrient_or_Body_Substance,T130
+Nutrient_or_Body_Substance,T197
+Nutrient_or_Body_Substance,T114
+Nutrient_or_Body_Substance,T109
+Nutrient_or_Body_Substance,T121
+Nutrient_or_Body_Substance,T192
+Nutrient_or_Body_Substance,T127
+Nutrient_or_Body_Substance,T087
+Nutrient_or_Body_Substance,T088
+Nutrient_or_Body_Substance,T028
+Nutrient_or_Body_Substance,T085
+Nutrient_or_Body_Substance,T086
+Nutrient_or_Body_Substance,T083
+Nutrient_or_Body_Substance,T007
+Nutrient_or_Body_Substance,T204
+Nutrient_or_Body_Substance,T004
+Nutrient_or_Body_Substance,T071
+Nutrient_or_Body_Substance,T168
+Nutrient_or_Body_Substance,T073
+Nutrient_or_Body_Substance,T072
+Nutrient_or_Body_Substance,T167
+Nutrient_or_Body_Substance,T043
+Nutrient_or_Body_Substance,T045
+Nutrient_or_Body_Substance,T044
+External_Substance,T116
+External_Substance,T195
+External_Substance,T123
+External_Substance,T122
+External_Substance,T103
+External_Substance,T120
+External_Substance,T104
+External_Substance,T200
+External_Substance,T196
+External_Substance,T126
+External_Substance,T131
+External_Substance,T125
+External_Substance,T129
+External_Substance,T130
+External_Substance,T197
+External_Substance,T114
+External_Substance,T109
+External_Substance,T121
+External_Substance,T192
+External_Substance,T127
+External_Substance,T037
+External_Substance,T087
+External_Substance,T088
+External_Substance,T028
+External_Substance,T085
+External_Substance,T086
+External_Substance,T083
+External_Substance,T071
+External_Substance,T168
+External_Substance,T073
+External_Substance,T072
+External_Substance,T167
+Therapeutic,T052
+Therapeutic,T064
+Therapeutic,T066
+Therapeutic,T057
+Therapeutic,T116
+Therapeutic,T195
+Therapeutic,T123
+Therapeutic,T122
+Therapeutic,T103
+Therapeutic,T120
+Therapeutic,T104
+Therapeutic,T200
+Therapeutic,T196
+Therapeutic,T126
+Therapeutic,T131
+Therapeutic,T125
+Therapeutic,T129
+Therapeutic,T130
+Therapeutic,T197
+Therapeutic,T114
+Therapeutic,T109
+Therapeutic,T121
+Therapeutic,T192
+Therapeutic,T127
+Therapeutic,T203
+Therapeutic,T074
+Therapeutic,T075
+Therapeutic,T097
+Therapeutic,T071
+Therapeutic,T168
+Therapeutic,T073
+Therapeutic,T072
+Therapeutic,T167
+Therapeutic,T091
+Therapeutic,T090
+Therapeutic,T093
+Therapeutic,T092
+Therapeutic,T094
+Therapeutic,T095
+Therapeutic,T038
+Therapeutic,T069
+Therapeutic,T068
+Therapeutic,T034
+Therapeutic,T070
+Therapeutic,T067
+Therapeutic,T060
+Therapeutic,T065
+Therapeutic,T058
+Therapeutic,T059
+Therapeutic,T063
+Therapeutic,T062
+Therapeutic,T061
+Diagnostic,T052
+Diagnostic,T064
+Diagnostic,T066
+Diagnostic,T057
+Diagnostic,T116
+Diagnostic,T195
+Diagnostic,T123
+Diagnostic,T122
+Diagnostic,T103
+Diagnostic,T120
+Diagnostic,T104
+Diagnostic,T200
+Diagnostic,T196
+Diagnostic,T126
+Diagnostic,T131
+Diagnostic,T125
+Diagnostic,T129
+Diagnostic,T130
+Diagnostic,T197
+Diagnostic,T114
+Diagnostic,T109
+Diagnostic,T121
+Diagnostic,T192
+Diagnostic,T127
+Diagnostic,T203
+Diagnostic,T074
+Diagnostic,T075
+Diagnostic,T097
+Diagnostic,T071
+Diagnostic,T168
+Diagnostic,T073
+Diagnostic,T072
+Diagnostic,T167
+Diagnostic,T091
+Diagnostic,T090
+Diagnostic,T093
+Diagnostic,T092
+Diagnostic,T094
+Diagnostic,T095
+Diagnostic,T038
+Diagnostic,T069
+Diagnostic,T068
+Diagnostic,T034
+Diagnostic,T070
+Diagnostic,T067
+Diagnostic,T060
+Diagnostic,T058
+Diagnostic,T059
+Diagnostic,T063
+Diagnostic,T062
+Diagnostic,T061
diff --git a/examples/conf/distemist.yaml b/examples/old_examples/conf/distemist.yaml
similarity index 100%
rename from examples/conf/distemist.yaml
rename to examples/old_examples/conf/distemist.yaml
diff --git a/examples/01_BioASQ_DisTEMIST.ipynb b/examples/old_examples/distemist/01_BioASQ_DisTEMIST.ipynb
similarity index 100%
rename from examples/01_BioASQ_DisTEMIST.ipynb
rename to examples/old_examples/distemist/01_BioASQ_DisTEMIST.ipynb
diff --git a/examples/dicts/distemist.py b/examples/old_examples/distemist/distemist.py
similarity index 100%
rename from examples/dicts/distemist.py
rename to examples/old_examples/distemist/distemist.py
diff --git a/examples/distemist_bioasq.yaml b/examples/old_examples/distemist/distemist_bioasq.yaml
similarity index 100%
rename from examples/distemist_bioasq.yaml
rename to examples/old_examples/distemist/distemist_bioasq.yaml
diff --git a/examples/notebook_util.py b/examples/old_examples/distemist/notebook_util.py
similarity index 100%
rename from examples/notebook_util.py
rename to examples/old_examples/distemist/notebook_util.py
diff --git a/examples/util.py b/examples/util.py
new file mode 100644
index 0000000..a62927f
--- /dev/null
+++ b/examples/util.py
@@ -0,0 +1,15 @@
+import pandas as pd
+
+def get_dataframe(predictions, kb):
+    ents = []
+    for d in predictions:
+        for e in d['entities']:
+            span = ' '.join(e['text'])
+            label = e['type']
+            top_concept = e['normalized'][0] if len(e['normalized']) > 0 else None        
+            if top_concept:
+                cui = top_concept['db_id']
+                ents.append({'mention' : span, 'class' :  label, 'cui' : cui, 'canonical name' : kb.cui_to_entity[cui].canonical_name, 'linked by' : top_concept['predicted_by'], 'score' : top_concept['score']})
+            else:
+                ents.append({'mention' : span, 'class' :  label, 'cui' : 'Not linkable'})
+    return pd.DataFrame(ents)
\ No newline at end of file

From b74a252667a7313f53cb11595d870e415d624ee0 Mon Sep 17 00:00:00 2001
From: Florian Borchert <Florian.Borchert@hpi.de>
Date: Fri, 5 Jan 2024 14:09:04 +0100
Subject: [PATCH 2/7] Readme

---
 README.md                               |  4 +++
 examples/03_SNOMED_Linking_German.ipynb | 36 ++++++++++---------------
 examples/README.md                      | 23 ++++++++++++----
 3 files changed, 36 insertions(+), 27 deletions(-)

diff --git a/README.md b/README.md
index 4653b30..e1dda99 100644
--- a/README.md
+++ b/README.md
@@ -21,6 +21,10 @@ We use [Poetry](https://python-poetry.org/) for building, testing and dependency
 
 A very simple pipeline highlighting the main components of xMEN can be found in [notebooks/00_Getting_Started.ipynb](notebooks/00_Getting_Started.ipynb)
 
+## 🎓 Examples
+
+For more advanced use cases, check out the [examples](examples) folder. 
+
 ## 📂 Data Loading
 
 Usually, BigBIO-compatible datasets can just be loaded from the Hugging Face Hub:
diff --git a/examples/03_SNOMED_Linking_German.ipynb b/examples/03_SNOMED_Linking_German.ipynb
index 3f43598..cc723c2 100644
--- a/examples/03_SNOMED_Linking_German.ipynb
+++ b/examples/03_SNOMED_Linking_German.ipynb
@@ -345,11 +345,11 @@
     {
      "data": {
       "text/html": [
-       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[01/05/24 13:54:14] </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO    </span> Loading hierarchical faiss index                                <a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">sap_bert_linker.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py#153\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">153</span></a>\n",
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[01/05/24 14:06:56] </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO    </span> Loading hierarchical faiss index                                <a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">sap_bert_linker.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py#153\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">153</span></a>\n",
        "</pre>\n"
       ],
       "text/plain": [
-       "\u001b[2;36m[01/05/24 13:54:14]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO    \u001b[0m Loading hierarchical faiss index                                \u001b]8;id=183098;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py\u001b\\\u001b[2msap_bert_linker.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=912029;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py#153\u001b\\\u001b[2m153\u001b[0m\u001b]8;;\u001b\\\n"
+       "\u001b[2;36m[01/05/24 14:06:56]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO    \u001b[0m Loading hierarchical faiss index                                \u001b]8;id=377106;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py\u001b\\\u001b[2msap_bert_linker.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=654858;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/sap_bert_linker.py#153\u001b\\\u001b[2m153\u001b[0m\u001b]8;;\u001b\\\n"
       ]
      },
      "metadata": {},
@@ -364,7 +364,7 @@
        "</pre>\n"
       ],
       "text/plain": [
-       "\u001b[2;36m                   \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO    \u001b[0m Loading index from                                                 \u001b]8;id=599606;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\u001b\\\u001b[2mfaiss_indexer.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=458220;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#64\u001b\\\u001b[2m64\u001b[0m\u001b]8;;\u001b\\\n",
+       "\u001b[2;36m                   \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO    \u001b[0m Loading index from                                                 \u001b]8;id=487488;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\u001b\\\u001b[2mfaiss_indexer.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=593198;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#64\u001b\\\u001b[2m64\u001b[0m\u001b]8;;\u001b\\\n",
        "\u001b[2;36m                    \u001b[0m         \u001b[35m/home/Florian.Borchert/.cache/xmen/snomed_german/index/sapbert/\u001b[0m\u001b[95memb\u001b[0m \u001b[2m                   \u001b[0m\n",
        "\u001b[2;36m                    \u001b[0m         \u001b[95med_faiss_hier.pickle\u001b[0m                                               \u001b[2m                   \u001b[0m\n"
       ]
@@ -375,12 +375,12 @@
     {
      "data": {
       "text/html": [
-       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[01/05/24 13:54:18] </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO    </span> Loaded index of type <span style=\"font-weight: bold\">&lt;</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff; font-weight: bold\">class</span><span style=\"color: #000000; text-decoration-color: #000000\"> </span><span style=\"color: #008000; text-decoration-color: #008000\">'faiss.swigfaiss.IndexHNSWFlat'</span><span style=\"font-weight: bold\">&gt;</span> and   <a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">faiss_indexer.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#66\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">66</span></a>\n",
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">[01/05/24 14:06:59] </span><span style=\"color: #000080; text-decoration-color: #000080\">INFO    </span> Loaded index of type <span style=\"font-weight: bold\">&lt;</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff; font-weight: bold\">class</span><span style=\"color: #000000; text-decoration-color: #000000\"> </span><span style=\"color: #008000; text-decoration-color: #008000\">'faiss.swigfaiss.IndexHNSWFlat'</span><span style=\"font-weight: bold\">&gt;</span> and   <a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">faiss_indexer.py</span></a><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">:</span><a href=\"file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#66\" target=\"_blank\"><span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">66</span></a>\n",
        "<span style=\"color: #7fbfbf; text-decoration-color: #7fbfbf\">                    </span>         size <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1967771</span>                                                       <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">                   </span>\n",
        "</pre>\n"
       ],
       "text/plain": [
-       "\u001b[2;36m[01/05/24 13:54:18]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO    \u001b[0m Loaded index of type \u001b[1m<\u001b[0m\u001b[1;95mclass\u001b[0m\u001b[39m \u001b[0m\u001b[32m'faiss.swigfaiss.IndexHNSWFlat'\u001b[0m\u001b[1m>\u001b[0m and   \u001b]8;id=776198;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\u001b\\\u001b[2mfaiss_indexer.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=995347;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#66\u001b\\\u001b[2m66\u001b[0m\u001b]8;;\u001b\\\n",
+       "\u001b[2;36m[01/05/24 14:06:59]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO    \u001b[0m Loaded index of type \u001b[1m<\u001b[0m\u001b[1;95mclass\u001b[0m\u001b[39m \u001b[0m\u001b[32m'faiss.swigfaiss.IndexHNSWFlat'\u001b[0m\u001b[1m>\u001b[0m and   \u001b]8;id=951108;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py\u001b\\\u001b[2mfaiss_indexer.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=612483;file:///mnt/nfs/home/Florian.Borchert/workspace/xmen/xmen/linkers/faiss_indexer.py#66\u001b\\\u001b[2m66\u001b[0m\u001b]8;;\u001b\\\n",
        "\u001b[2;36m                    \u001b[0m         size \u001b[1;36m1967771\u001b[0m                                                       \u001b[2m                   \u001b[0m\n"
       ]
      },
@@ -390,7 +390,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "b477d6652c9e4326b9d9ab5f651d24a8",
+       "model_id": "6d71624d3cf4489eb2925db2738cf87f",
        "version_major": 2,
        "version_minor": 0
       },
@@ -698,7 +698,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "f48fee7d065841388efc407f3aeaeac5",
+       "model_id": "18e52613deee4c73984228a078099896",
        "version_major": 2,
        "version_minor": 0
       },
@@ -712,7 +712,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "fa32aa44068041bb97bd5ebfaaf77248",
+       "model_id": "1d389d47d62a49a89af6ce30ccbfb23c",
        "version_major": 2,
        "version_minor": 0
       },
@@ -726,7 +726,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "aff7ad16c32648c5afb548173e64dfa9",
+       "model_id": "4df5f4cd32fa4313bdd86e140f50d2a9",
        "version_major": 2,
        "version_minor": 0
       },
@@ -740,7 +740,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "237c7aa368b6418fbf9d7b1bb34d9c8e",
+       "model_id": "fa0789b1e274435c891bc8c9807475fa",
        "version_major": 2,
        "version_minor": 0
       },
@@ -775,7 +775,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "b9a550a3efd34c76a001844587b21087",
+       "model_id": "ff5b9aaf85934a6bbdddd51fbd60fe85",
        "version_major": 2,
        "version_minor": 0
       },
@@ -789,7 +789,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "bd911340658d487087b452f3e87c1d68",
+       "model_id": "202158acb5a949fda2d10d31774680f2",
        "version_major": 2,
        "version_minor": 0
       },
@@ -803,7 +803,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "5986edf5b65e4fd3a53ecb22d4728ace",
+       "model_id": "e00a9111e44d4fdcb5271c065f60b3b3",
        "version_major": 2,
        "version_minor": 0
       },
@@ -817,7 +817,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "a7a6159291d24e42a63e6e0a647dc41f",
+       "model_id": "dd32237ae8f545b59e9fe32588a7b0ff",
        "version_major": 2,
        "version_minor": 0
       },
@@ -1075,14 +1075,6 @@
     "get_dataframe(reranked, kb)"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "9ace9bf0-c0ef-4c9e-9bd6-6ecce3cf03b6",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
   {
    "cell_type": "code",
    "execution_count": null,
diff --git a/examples/README.md b/examples/README.md
index d8f8a05..f100ff5 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -1,13 +1,26 @@
 # Examples
 
+|Link|Language|Description|
+|---|---|---|
+|[01_BRONCO_German.ipynb](01_BRONCO_German.ipynb)|🇩🇪|Candidate generation and supervised re-ranking using the BRONCO corpus.<br>Shows how you can configure multiple dictionaries in the same config file.|
+|[02_GGPONC_German.ipynb](02_GGPONC_German.ipynb)|🇩🇪|Using a spaCy NER model with xMEN<br>Shows how to build a pipeline without labelled data using candidate generation, type filtering and pre-trained re-rankers|
+|[03_SNOMED_Linking_German.ipynb](03_SNOMED_Linking_German.ipynb)|🇩🇪|Linking against codes in UMLS source vocabularies (here SNOMED CT)|
+
+## External Links
+
 |Link|Description|
 |---|---|
-|[01_BRONCO_German.ipynb](01_BRONCO_German.ipynb)|Candidate generation and supervised re-ranking using the BRONCO corpus.<br>Shows how you can configure multiple dictionaries in the same config file.|
-|[02_GGPONC_German.ipynb](02_GGPONC_German.ipynb)|Using a spaCy NER model with xMEN<br>Shows how to build a pipeline without labelled data using candidate generation, type filtering and pre-trained re-rankers|
-|[03_SNOMED_Linking_German.ipynb](03_SNOMED_Linking_German.ipynb)|Linking against codes in UMLS source vocabularies (here SNOMED CT)|
-| | |
+| https://github.com/hpi-dhc/symptemist_biocreative_2023 | 🇪🇸 | BioCreative VIII SympTEMIST Challenge (1st place in entity linking track) |
 
 
 ## Benchmarks
 
-More examples for configurations can be found in the [Benchmarks](../benchmarks) folder.
\ No newline at end of file
+More examples for configurations can be found in the [Benchmarks](../benchmarks) folder.
+
+|Benchmark|Language|
+|---|---|
+|[Quaero](../benchmarks/benchmark/quaero.yaml)|🇫🇷|
+|[MedMentions](../benchmarks/benchmark/medmentions_en.yaml)|🇬🇧|
+|[DisTEMIST](../benchmarks/benchmark/distemist.yaml)|🇪🇸|
+|[BRONCO](../benchmarks/benchmark/bronco.yaml)|🇩🇪|
+|[Mantra](../benchmarks/benchmark/mantra.yaml)|🇬🇧 🇫🇷 🇪🇸 🇩🇪 🇳🇱|
\ No newline at end of file

From a846fbdb925fcc3048ec55e83090a9bf15b8db56 Mon Sep 17 00:00:00 2001
From: Florian Borchert <fl.borchert@gmail.com>
Date: Fri, 5 Jan 2024 14:10:03 +0100
Subject: [PATCH 3/7] Update README.md

---
 examples/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/README.md b/examples/README.md
index f100ff5..41b843f 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -3,7 +3,7 @@
 |Link|Language|Description|
 |---|---|---|
 |[01_BRONCO_German.ipynb](01_BRONCO_German.ipynb)|🇩🇪|Candidate generation and supervised re-ranking using the BRONCO corpus.<br>Shows how you can configure multiple dictionaries in the same config file.|
-|[02_GGPONC_German.ipynb](02_GGPONC_German.ipynb)|🇩🇪|Using a spaCy NER model with xMEN<br>Shows how to build a pipeline without labelled data using candidate generation, type filtering and pre-trained re-rankers|
+|[02_spaCy_German.ipynb](02_spaCy_German.ipynb)|🇩🇪|Using a spaCy NER model with xMEN<br>Shows how to build a pipeline without labelled data using candidate generation, type filtering and pre-trained re-rankers|
 |[03_SNOMED_Linking_German.ipynb](03_SNOMED_Linking_German.ipynb)|🇩🇪|Linking against codes in UMLS source vocabularies (here SNOMED CT)|
 
 ## External Links
@@ -23,4 +23,4 @@ More examples for configurations can be found in the [Benchmarks](../benchmarks)
 |[MedMentions](../benchmarks/benchmark/medmentions_en.yaml)|🇬🇧|
 |[DisTEMIST](../benchmarks/benchmark/distemist.yaml)|🇪🇸|
 |[BRONCO](../benchmarks/benchmark/bronco.yaml)|🇩🇪|
-|[Mantra](../benchmarks/benchmark/mantra.yaml)|🇬🇧 🇫🇷 🇪🇸 🇩🇪 🇳🇱|
\ No newline at end of file
+|[Mantra](../benchmarks/benchmark/mantra.yaml)|🇬🇧 🇫🇷 🇪🇸 🇩🇪 🇳🇱|

From 7db95a5a8ac77f919de765b8e87a0ae276d33b54 Mon Sep 17 00:00:00 2001
From: Florian Borchert <fl.borchert@gmail.com>
Date: Fri, 5 Jan 2024 14:11:29 +0100
Subject: [PATCH 4/7] Update README.md

---
 examples/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/README.md b/examples/README.md
index 41b843f..82dd450 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -8,8 +8,8 @@
 
 ## External Links
 
-|Link|Description|
-|---|---|
+|Link|Language|Description|
+|---|---|---|
 | https://github.com/hpi-dhc/symptemist_biocreative_2023 | 🇪🇸 | BioCreative VIII SympTEMIST Challenge (1st place in entity linking track) |
 
 

From cc22698f426b27792215500e5acbd41535e4d67d Mon Sep 17 00:00:00 2001
From: Florian Borchert <fl.borchert@gmail.com>
Date: Fri, 5 Jan 2024 14:12:19 +0100
Subject: [PATCH 5/7] Update README.md

---
 examples/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/README.md b/examples/README.md
index 82dd450..a0741ab 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -15,7 +15,7 @@
 
 ## Benchmarks
 
-More examples for configurations can be found in the [Benchmarks](../benchmarks) folder.
+More examples for configurations can be found in the [Benchmarks](../benchmarks/benchmark) folder.
 
 |Benchmark|Language|
 |---|---|

From 04a414bd2ac6094991d579ef965e077df73fb78e Mon Sep 17 00:00:00 2001
From: Florian Borchert <Florian.Borchert@hpi.de>
Date: Fri, 5 Jan 2024 14:30:11 +0100
Subject: [PATCH 6/7] Black

---
 examples/dicts/umls_source.py | 25 +++++++++++++++++--------
 examples/util.py              | 26 ++++++++++++++++++--------
 2 files changed, 35 insertions(+), 16 deletions(-)

diff --git a/examples/dicts/umls_source.py b/examples/dicts/umls_source.py
index a32222d..7f8ab32 100644
--- a/examples/dicts/umls_source.py
+++ b/examples/dicts/umls_source.py
@@ -3,6 +3,7 @@
 from collections import defaultdict
 from xmen.log import logger
 
+
 def get_concept_details(cfg):
     mrconso = "MRCONSO.RRF"
     concept_details = {}
@@ -23,12 +24,14 @@ def get_concept_details(cfg):
                 cui = concept["CUI"]
                 if id_key:
                     sid = concept[id_key]
-                else:                    
+                else:
                     sid = concept["SDUI"]
                     if not sid:
                         sid = concept["SCUI"]
                 if not sid:
-                    logger.warn(f"Skipping concept with CUI {cui} because we could not find a valid source vocabulary ID")
+                    logger.warn(
+                        f"Skipping concept with CUI {cui} because we could not find a valid source vocabulary ID"
+                    )
                 name = concept["STR"]
                 if sid in concept_details:
                     concept_details[sid]["aliases"].append(name)
@@ -36,19 +39,25 @@ def get_concept_details(cfg):
                     concept_details[sid] = {"concept_id": sid, "canonical_name": name, "types": [], "aliases": []}
                 scui2cui[sid].append(cui)
 
-    if umls_extend := cfg.dict.custom.get('umls_extend'):
+    if umls_extend := cfg.dict.custom.get("umls_extend"):
         # Optionally extend with UMLS synonyms
-        other_umls_concepts = get_umls_concepts(meta_path, 
-            umls_extend.get("lang"), sabs=umls_extend.get("sabs"), sources=umls_extend.get("sources"), semantic_groups=umls_extend.get("semantic_groups"), semantic_types=umls_extend.get("semantic_types"))
+        other_umls_concepts = get_umls_concepts(
+            meta_path,
+            umls_extend.get("lang"),
+            sabs=umls_extend.get("sabs"),
+            sources=umls_extend.get("sources"),
+            semantic_groups=umls_extend.get("semantic_groups"),
+            semantic_types=umls_extend.get("semantic_types"),
+        )
 
         for scui, concept in tqdm(concept_details.items()):
             for cui in scui2cui[scui]:
                 if cui in other_umls_concepts:
-                    for t in other_umls_concepts[cui]['types']:
+                    for t in other_umls_concepts[cui]["types"]:
                         if not t in concept["types"]:
                             concept["types"].append(t)
                     for new_alias in other_umls_concepts[cui]["aliases"] + [other_umls_concepts[cui]["canonical_name"]]:
                         if new_alias not in concept["aliases"] and new_alias != concept["canonical_name"]:
-                            concept["aliases"].append(new_alias)                    
-    
+                            concept["aliases"].append(new_alias)
+
     return concept_details
diff --git a/examples/util.py b/examples/util.py
index a62927f..1f9f7d4 100644
--- a/examples/util.py
+++ b/examples/util.py
@@ -1,15 +1,25 @@
 import pandas as pd
 
+
 def get_dataframe(predictions, kb):
     ents = []
     for d in predictions:
-        for e in d['entities']:
-            span = ' '.join(e['text'])
-            label = e['type']
-            top_concept = e['normalized'][0] if len(e['normalized']) > 0 else None        
+        for e in d["entities"]:
+            span = " ".join(e["text"])
+            label = e["type"]
+            top_concept = e["normalized"][0] if len(e["normalized"]) > 0 else None
             if top_concept:
-                cui = top_concept['db_id']
-                ents.append({'mention' : span, 'class' :  label, 'cui' : cui, 'canonical name' : kb.cui_to_entity[cui].canonical_name, 'linked by' : top_concept['predicted_by'], 'score' : top_concept['score']})
+                cui = top_concept["db_id"]
+                ents.append(
+                    {
+                        "mention": span,
+                        "class": label,
+                        "cui": cui,
+                        "canonical name": kb.cui_to_entity[cui].canonical_name,
+                        "linked by": top_concept["predicted_by"],
+                        "score": top_concept["score"],
+                    }
+                )
             else:
-                ents.append({'mention' : span, 'class' :  label, 'cui' : 'Not linkable'})
-    return pd.DataFrame(ents)
\ No newline at end of file
+                ents.append({"mention": span, "class": label, "cui": "Not linkable"})
+    return pd.DataFrame(ents)

From 7310147cef3e480b384036fcd2bc6eac2f2c7b70 Mon Sep 17 00:00:00 2001
From: Florian Borchert <Florian.Borchert@hpi.de>
Date: Fri, 5 Jan 2024 15:01:31 +0100
Subject: [PATCH 7/7] Move DisTEMIST

---
 examples/old_examples/conf/distemist.yaml                  | 7 -------
 .../01_BioASQ_DisTEMIST.ipynb                              | 0
 .../{distemist => distemist_clef2023}/distemist.py         | 0
 .../distemist_bioasq.yaml                                  | 0
 .../{distemist => distemist_clef2023}/notebook_util.py     | 0
 5 files changed, 7 deletions(-)
 delete mode 100644 examples/old_examples/conf/distemist.yaml
 rename examples/old_examples/{distemist => distemist_clef2023}/01_BioASQ_DisTEMIST.ipynb (100%)
 rename examples/old_examples/{distemist => distemist_clef2023}/distemist.py (100%)
 rename examples/old_examples/{distemist => distemist_clef2023}/distemist_bioasq.yaml (100%)
 rename examples/old_examples/{distemist => distemist_clef2023}/notebook_util.py (100%)

diff --git a/examples/old_examples/conf/distemist.yaml b/examples/old_examples/conf/distemist.yaml
deleted file mode 100644
index a491cae..0000000
--- a/examples/old_examples/conf/distemist.yaml
+++ /dev/null
@@ -1,7 +0,0 @@
-name: distemist
-
-dict:
-  custom:
-    lang: 
-      - es
-    distemist_path: local_files/dictionary_distemist.tsv
\ No newline at end of file
diff --git a/examples/old_examples/distemist/01_BioASQ_DisTEMIST.ipynb b/examples/old_examples/distemist_clef2023/01_BioASQ_DisTEMIST.ipynb
similarity index 100%
rename from examples/old_examples/distemist/01_BioASQ_DisTEMIST.ipynb
rename to examples/old_examples/distemist_clef2023/01_BioASQ_DisTEMIST.ipynb
diff --git a/examples/old_examples/distemist/distemist.py b/examples/old_examples/distemist_clef2023/distemist.py
similarity index 100%
rename from examples/old_examples/distemist/distemist.py
rename to examples/old_examples/distemist_clef2023/distemist.py
diff --git a/examples/old_examples/distemist/distemist_bioasq.yaml b/examples/old_examples/distemist_clef2023/distemist_bioasq.yaml
similarity index 100%
rename from examples/old_examples/distemist/distemist_bioasq.yaml
rename to examples/old_examples/distemist_clef2023/distemist_bioasq.yaml
diff --git a/examples/old_examples/distemist/notebook_util.py b/examples/old_examples/distemist_clef2023/notebook_util.py
similarity index 100%
rename from examples/old_examples/distemist/notebook_util.py
rename to examples/old_examples/distemist_clef2023/notebook_util.py

	mention	class
0	Cetuximab	Clinical_Drug
1	monoklonaler Antikörper	Clinical_Drug
2	epidermalen Wachstumsfaktorrezeptor	Nutrient_or_Body_Substance
3	EGFR	Nutrient_or_Body_Substance
4	Therapie des fortgeschrittenen kolorektalen Ka...	Therapeutic
5	fortgeschrittenen kolorektalen Karzinoms	Diagnosis_or_Pathology
6	Irinotecan	Clinical_Drug
7	FOLFOX	Therapeutic
8	Versagen einer Behandlung	Other_Finding
9	Behandlung mit Oxaliplatin und Irinotecan	Therapeutic
10	Oxaliplatin	Clinical_Drug
11	Irinotecan	Clinical_Drug
12	HPV-Diagnostik	Diagnostic
13	Plattenepithelkarzinom der Mundhöhle	Diagnosis_or_Pathology
	mention	class	cui	canonical name	linked by	score
0	Cetuximab	Clinical_Drug	409401002	Product containing cetuximab (medicinal product)	[ngram, sapbert]	1.000000
1	monoklonaler Antikörper	Clinical_Drug	49616005	Monoclonal antibody	[ngram, sapbert]	0.982318
2	epidermalen Wachstumsfaktorrezeptor	Nutrient_or_Body_Substance	86960007	Epidermal growth factor-urogastrone receptor	[sapbert]	0.937485
3	EGFR	Nutrient_or_Body_Substance	86960007	Epidermal growth factor-urogastrone receptor	[ngram, sapbert]	1.000000
4	Therapie des fortgeschrittenen kolorektalen Ka...	Therapeutic	1217692004	Metastasis from malignant neoplasm of colon an...	[ngram, sapbert]	0.696480
5	fortgeschrittenen kolorektalen Karzinoms	Diagnosis_or_Pathology	1217692004	Metastasis from malignant neoplasm of colon an...	[ngram, sapbert]	0.843411
6	Irinotecan	Clinical_Drug	372538008	Irinotecan	[ngram, sapbert]	1.000000
7	FOLFOX	Therapeutic	699297004	Ohdo syndrome, Maat-Kievit-Brunner type	[sapbert]	0.812118
8	Versagen einer Behandlung	Other_Finding	7058009	Noncompliance with treatment	[ngram, sapbert]	0.881669
9	Behandlung mit Oxaliplatin und Irinotecan	Therapeutic	447053005	Oxaliplatin desensitization therapy	[ngram, sapbert]	0.673975
10	Oxaliplatin	Clinical_Drug	395814003	Oxaliplatin	[ngram, sapbert]	1.000000
11	Irinotecan	Clinical_Drug	372538008	Irinotecan	[ngram, sapbert]	1.000000
12	HPV-Diagnostik	Diagnostic	700152009	Human papilloma virus screening	[sapbert]	0.913146
13	Plattenepithelkarzinom der Mundhöhle	Diagnosis_or_Pathology	307502000	Squamous cell carcinoma of mouth	[ngram, sapbert]	0.987126
	mention	class	cui	canonical name	linked by	score
0	Cetuximab	Clinical_Drug	409400001	Cetuximab	[ngram, sapbert]	0.040032
1	monoklonaler Antikörper	Clinical_Drug	49616005	Monoclonal antibody	[ngram, sapbert]	0.043237
2	epidermalen Wachstumsfaktorrezeptor	Nutrient_or_Body_Substance	86960007	Epidermal growth factor-urogastrone receptor	[sapbert]	0.022354
3	EGFR	Nutrient_or_Body_Substance	86960007	Epidermal growth factor-urogastrone receptor	[ngram, sapbert]	0.033175
4	Therapie des fortgeschrittenen kolorektalen Ka...	Therapeutic	1217692004	Metastasis from malignant neoplasm of colon an...	[ngram, sapbert]	0.017036
5	fortgeschrittenen kolorektalen Karzinoms	Diagnosis_or_Pathology	126837005	Tumor of large intestine	[ngram, sapbert]	0.017528
6	Irinotecan	Clinical_Drug	372538008	Irinotecan	[ngram, sapbert]	0.042663
7	FOLFOX	Therapeutic	461391000124102	Folfox protocol	[ngram, sapbert]	0.018860
8	Versagen einer Behandlung	Other_Finding	266721009	Absent response to treatment	[sapbert]	0.017751
9	Behandlung mit Oxaliplatin und Irinotecan	Therapeutic	447053005	Oxaliplatin desensitization therapy	[ngram, sapbert]	0.017154
10	Oxaliplatin	Clinical_Drug	395814003	Oxaliplatin	[ngram, sapbert]	0.039662
11	Irinotecan	Clinical_Drug	372538008	Irinotecan	[ngram, sapbert]	0.045797
12	HPV-Diagnostik	Diagnostic	700152009	Human papilloma virus screening	[sapbert]	0.019670
13	Plattenepithelkarzinom der Mundhöhle	Diagnosis_or_Pathology	307502000	Squamous cell carcinoma of mouth	[ngram, sapbert]	0.037824
	_word_len	_abbrev	gt_start	gt_end	gt_text	pred_start	pred_end	pred_text	ner_match_type	gold_concept	gold_type	pred_index	pred_index_score	pred_top	pred_top_score	corpus_id	document_id
0	1.0	False	11.0	17.0	[entity]	11	15	[entity]	be	{'db_id': 'c1', 'target_kb': 'UMLS', 'type': N...	None	0	None	c1	None	x	1
1	1.0	False	11.0	17.0	[entity]	11	17	[entity]	be	{'db_id': 'c1', 'target_kb': 'UMLS', 'type': N...	None	0	None	c1	None	x	1
2	NaN	None	NaN	NaN	None	16	17	[entity]	fp	None	None	-1	None	None	None	x	1