From 7d7c57e6b4cbaf87e7b1b33296692ad7eb513063 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Sat, 2 Nov 2024 23:00:00 +0100 Subject: [PATCH 01/43] first change in notebook --- .../tutorial_omop_visualization.ipynb | 329 +++++++++++++----- 1 file changed, 244 insertions(+), 85 deletions(-) diff --git a/docs/notebooks/tutorial_omop_visualization.ipynb b/docs/notebooks/tutorial_omop_visualization.ipynb index bc60f09..982a744 100644 --- a/docs/notebooks/tutorial_omop_visualization.ipynb +++ b/docs/notebooks/tutorial_omop_visualization.ipynb @@ -55,18 +55,9 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. To reload it, use:\n", - " %reload_ext autoreload\n" - ] - } - ], + "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2" @@ -74,7 +65,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -88,7 +79,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -97,7 +88,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -111,7 +102,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -121,7 +112,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -144,7 +135,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -160,7 +151,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -168,7 +159,8 @@ "output_type": "stream", "text": [ "Path to data exists, load tables from there: ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9\n", - "missing tables: [['concept'], ['vocabulary'], ['domain'], ['concept_class'], ['concept_relationship'], ['relationship'], ['concept_synonym'], ['concept_ancestor'], ['source_to_concept_map'], ['drug_strength']]\n" + "missing tables: ['domain', 'concept_class', 'relationship', 'concept_synonym', 'concept_ancestor', 'source_to_concept_map', 'drug_strength']\n", + "unused files: ['attribute_definition.csv', 'cohort_attribute.csv']\n" ] } ], @@ -185,7 +177,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -231,96 +223,108 @@ " \n", " \n", " 4\n", - " condition_era\n", + " concept\n", " \n", " \n", " 5\n", - " condition_occurrence\n", + " concept_relationship\n", " \n", " \n", " 6\n", - " cost\n", + " condition_era\n", " \n", " \n", " 7\n", - " death\n", + " condition_occurrence\n", " \n", " \n", " 8\n", - " device_exposure\n", + " cost\n", " \n", " \n", " 9\n", - " dose_era\n", + " death\n", " \n", " \n", " 10\n", - " drug_era\n", + " device_exposure\n", " \n", " \n", " 11\n", - " drug_exposure\n", + " dose_era\n", " \n", " \n", " 12\n", - " fact_relationship\n", + " drug_era\n", " \n", " \n", " 13\n", - " location\n", + " drug_exposure\n", " \n", " \n", " 14\n", - " measurement\n", + " fact_relationship\n", " \n", " \n", " 15\n", - " metadata\n", + " location\n", " \n", " \n", " 16\n", - " note\n", + " measurement\n", " \n", " \n", " 17\n", - " note_nlp\n", + " metadata\n", " \n", " \n", " 18\n", - " observation\n", + " note\n", " 
\n", " \n", " 19\n", - " observation_period\n", + " note_nlp\n", " \n", " \n", " 20\n", - " payer_plan_period\n", + " observation\n", " \n", " \n", " 21\n", - " person\n", + " observation_period\n", " \n", " \n", " 22\n", - " procedure_occurrence\n", + " payer_plan_period\n", " \n", " \n", " 23\n", - " provider\n", + " person\n", " \n", " \n", " 24\n", - " specimen\n", + " procedure_occurrence\n", " \n", " \n", " 25\n", - " visit_detail\n", + " provider\n", " \n", " \n", " 26\n", + " specimen\n", + " \n", + " \n", + " 27\n", + " visit_detail\n", + " \n", + " \n", + " 28\n", " visit_occurrence\n", " \n", + " \n", + " 29\n", + " vocabulary\n", + " \n", " \n", "\n", "" @@ -331,32 +335,35 @@ "1 cdm_source\n", "2 cohort\n", "3 cohort_definition\n", - "4 condition_era\n", - "5 condition_occurrence\n", - "6 cost\n", - "7 death\n", - "8 device_exposure\n", - "9 dose_era\n", - "10 drug_era\n", - "11 drug_exposure\n", - "12 fact_relationship\n", - "13 location\n", - "14 measurement\n", - "15 metadata\n", - "16 note\n", - "17 note_nlp\n", - "18 observation\n", - "19 observation_period\n", - "20 payer_plan_period\n", - "21 person\n", - "22 procedure_occurrence\n", - "23 provider\n", - "24 specimen\n", - "25 visit_detail\n", - "26 visit_occurrence" + "4 concept\n", + "5 concept_relationship\n", + "6 condition_era\n", + "7 condition_occurrence\n", + "8 cost\n", + "9 death\n", + "10 device_exposure\n", + "11 dose_era\n", + "12 drug_era\n", + "13 drug_exposure\n", + "14 fact_relationship\n", + "15 location\n", + "16 measurement\n", + "17 metadata\n", + "18 note\n", + "19 note_nlp\n", + "20 observation\n", + "21 observation_period\n", + "22 payer_plan_period\n", + "23 person\n", + "24 procedure_occurrence\n", + "25 provider\n", + "26 specimen\n", + "27 visit_detail\n", + "28 visit_occurrence\n", + "29 vocabulary" ] }, - "execution_count": 27, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -368,13 +375,13 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "eb69f3700fb343a8b204fde5a22a5d2b", + "model_id": "9ec20041bac2441283e9998549a5a1aa", "version_major": 2, "version_minor": 0 }, @@ -392,7 +399,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -403,7 +410,7 @@ " shape of .r: (0, 0, 0) " ] }, - "execution_count": 29, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -442,13 +449,13 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "260450539f1b4ebba16f17460d50d40f", + "model_id": "df43d6348b9f4c089065ce35d2a7ed78", "version_major": 2, "version_minor": 0 }, @@ -466,37 +473,189 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "multiple units for features: [[ 0]\n", + " [ 1]\n", + " [ 23]\n", + " [ 55]\n", + " [122]\n", + " [160]\n", + " [245]\n", + " [296]\n", + " [306]\n", + " [349]\n", + " [418]]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
concept_idunit_concept_idno_unitsmultiple_units
009557FalseTrue
108749FalseTrue
208923FalseTrue
308840FalseTrue
408859FalseTrue
...............
4644252714044777590FalseFalse
46542868642<NA>TrueFalse
46643055270<NA>TrueFalse
46746236952<NA>TrueFalse
46820000000008554FalseFalse
\n", + "

469 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " concept_id unit_concept_id no_units multiple_units\n", + "0 0 9557 False True\n", + "1 0 8749 False True\n", + "2 0 8923 False True\n", + "3 0 8840 False True\n", + "4 0 8859 False True\n", + ".. ... ... ... ...\n", + "464 42527140 44777590 False False\n", + "465 42868642 True False\n", + "466 43055270 True False\n", + "467 46236952 True False\n", + "468 2000000000 8554 False False\n", + "\n", + "[469 rows x 4 columns]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "edata = ed.io.omop.setup_variables(\n", - " backend_handle=con,\n", " edata=edata,\n", - " tables=list(selected_vars.value),\n", - " start_time=\"observation_period_start_date\",\n", - " interval_length_number=28,\n", + " backend_handle=con,\n", + " data_tables=list(selected_vars.value),\n", + " data_field_to_keep=[\"value_as_number\"],\n", + " interval_length_number=20,\n", " interval_length_unit=\"day\",\n", - " num_intervals=\"max_observation_duration\",\n", + " num_intervals=10,\n", " concept_ids=\"all\",\n", " aggregation_strategy=\"last\",\n", - ")" + " enrich_var_with_feature_info=True,\n", + " enrich_var_with_unit_info=False,\n", + ")\n", + "edata.uns[\"unit_report_measurement\"]" ] }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "EHRData object with n_obs x n_var = 100 x 450, and a timeseries of 320 steps.\n", + "EHRData object with n_obs x n_var = 100 x 450, and a timeseries of 10 steps.\n", " shape of .X: (0, 0) \n", - " shape of .r: (100, 450, 320) " + " shape of .r: (100, 450, 10) " ] }, - "execution_count": 32, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -507,7 +666,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ From 09ce9b540911b6a097abc1fc8db5e8c61c337365 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Mon, 4 Nov 2024 11:29:26 +0100 Subject: [PATCH 02/43] inspect why 2 omop dt fail on github ci --- src/ehrdata/dt/datasets.py | 19 +++++++++---------- tests/test_dt/test_dt.py | 37 +++++++++++++++++++++++-------------- 2 files changed, 32 insertions(+), 24 deletions(-) diff --git a/src/ehrdata/dt/datasets.py b/src/ehrdata/dt/datasets.py index 33545be..3db0469 100644 --- a/src/ehrdata/dt/datasets.py +++ b/src/ehrdata/dt/datasets.py @@ -75,7 +75,7 @@ def mimic_iv_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = N >>> con.execute("SHOW TABLES;").fetchall() """ if data_path is None: - data_path = "ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9" + data_path = Path("ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9") if os.path.exists(data_path): print(f"Path to data exists, load tables from there: {data_path}") @@ -85,7 +85,7 @@ def mimic_iv_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = N response = requests.get(URL) if response.status_code == 200: - # Step 2: Use zipfile and io to open the ZIP file in memory + # Use zipfile and io to open the ZIP file in memory with zipfile.ZipFile(io.BytesIO(response.content)) as z: # Extract all contents of the ZIP file z.extractall("ehrapy_data") # Specify the folder where files will be extracted @@ -93,8 +93,8 @@ def mimic_iv_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = N else: print(f"Failed to download the file. 
Status code: {response.status_code}") return - # TODO: capitalization, and lowercase, and containing the name - return _set_up_duckdb(data_path + "/1_omop_data_csv", backend_handle, prefix="2b_") + + return _set_up_duckdb(data_path / "1_omop_data_csv", backend_handle, prefix="2b_") def gibleed_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: @@ -133,9 +133,6 @@ def gibleed_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = No response = requests.get(URL) if response.status_code == 200: - # extract_path = data_path / "gibleed_data_csv" - # extract_path.mkdir(parents=True, exist_ok=True) - # Use zipfile and io to open the ZIP file in memory with zipfile.ZipFile(io.BytesIO(response.content)) as z: # Extract all contents of the ZIP file into the correct subdirectory @@ -144,16 +141,15 @@ def gibleed_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = No else: print(f"Failed to download the file. Status code: {response.status_code}") + return - # extracted_folder = next(data_path.iterdir(), data_path) - # extracted_folder = next((folder for folder in data_path.iterdir() if folder.is_dir() and "_csv" in folder.name and "__MACOSX" not in folder.name), data_path) return _set_up_duckdb(data_path / "GiBleed_5.3", backend_handle) def synthea27nj_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: """Loads the Synthea27NJ dataset in the OMOP Common Data model. - More details: https://github.com/darwin-eu/EunomiaDatasets/tree/main/datasets/Synthea27Nj. + More details: https://github.com/OHDSI/EunomiaDatasets/tree/main/datasets/Synthea27Nj. Parameters ---------- @@ -214,3 +210,6 @@ def mimic_ii(backend_handle: DuckDBPyConnection, data_path: Path | None = None) """Loads the MIMIC2 dataset""" # TODO: replace mimic_ii as is in ehrapy with its dict-of-table return time - map variables to OMOP? 
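For a quick interactive sanity check of the loaders touched in this commit, the same pattern as the reworked tests further down can be reused; a minimal sketch (assumes the package is installed and the dataset host is reachable):

```python
import duckdb

import ehrdata as ed

con = duckdb.connect()
ed.dt.mimic_iv_omop(backend_handle=con)
# every recognized OMOP table is now queryable by name
print(len(con.execute("SHOW TABLES").df()))
print(con.execute("SELECT * FROM person").df().shape)
con.close()
```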
raise NotImplementedError() + + +# TODO: physionet2012, physionet2019 diff --git a/tests/test_dt/test_dt.py b/tests/test_dt/test_dt.py index 72fa7a3..219bf35 100644 --- a/tests/test_dt/test_dt.py +++ b/tests/test_dt/test_dt.py @@ -1,25 +1,34 @@ +from pathlib import Path + import duckdb +import pytest import ehrdata as ed +TEST_DATA_DIR = Path(__file__).parent / "ehrapy_data" + -def test_mimic_iv_omop(): +@pytest.fixture(scope="function") +def duckdb_connection(): + """Fixture to create and return a DuckDB connection for testing.""" con = duckdb.connect() - ed.dt.mimic_iv_omop(backend_handle=con) - assert len(con.execute("SHOW TABLES").df()) == 30 + yield con con.close() -# TODO -# def test_gibleed_omop(): -# con = duckdb.connect() -# ed.dt.gibleed_omop(backend_handle=con) -# assert len(con.execute("SHOW TABLES").df()) == 36 -# con.close() +def test_mimic_iv_omop(duckdb_connection): + ed.dt.mimic_iv_omop(backend_handle=duckdb_connection) + assert len(duckdb_connection.execute("SHOW TABLES").df()) == 30 + assert duckdb_connection.execute("SELECT * FROM person").df().shape == (100, 18) + + +def test_gibleed_omop(duckdb_connection): + ed.dt.gibleed_omop(backend_handle=duckdb_connection) + assert len(duckdb_connection.execute("SHOW TABLES").df()) == 36 + assert duckdb_connection.execute("SELECT * FROM person").df().shape == (2694, 18) -# def test_synthea27nj_omop(): -# con = duckdb.connect() -# ed.dt.synthea27nj_omop(backend_handle=con) -# assert len(con.execute("SHOW TABLES").df()) == 37 -# con.close() +def test_synthea27nj_omop(duckdb_connection): + ed.dt.synthea27nj_omop(backend_handle=duckdb_connection) + assert len(duckdb_connection.execute("SHOW TABLES").df()) == 37 + assert duckdb_connection.execute("SELECT * FROM person").df().shape == (28, 18) From 54fe86e2cc9f86d6675fbed91915fa68781f4fcf Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Mon, 4 Nov 2024 11:36:07 +0100 Subject: [PATCH 03/43] inspect why 2 omop dt fail on github ci --- src/ehrdata/dt/datasets.py | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/src/ehrdata/dt/datasets.py b/src/ehrdata/dt/datasets.py index 3db0469..4eb3182 100644 --- a/src/ehrdata/dt/datasets.py +++ b/src/ehrdata/dt/datasets.py @@ -182,28 +182,17 @@ def synthea27nj_omop(backend_handle: DuckDBPyConnection, data_path: Path | None response = requests.get(URL) if response.status_code == 200: - extract_path = data_path / "synthea27nj_omop_csv" - extract_path.mkdir(parents=True, exist_ok=True) - # Use zipfile and io to open the ZIP file in memory with zipfile.ZipFile(io.BytesIO(response.content)) as z: # Extract all contents of the ZIP file into the correct subdirectory - z.extractall(extract_path) # Extracting to 'extract_path' - print(f"Download successful. ZIP file downloaded and extracted successfully to {extract_path}.") + z.extractall(data_path) # Extracting to 'extract_path' + print(f"Download successful. ZIP file downloaded and extracted successfully to {data_path}.") else: print(f"Failed to download the file. 
Status code: {response.status_code}") return - extracted_folder = next( - ( - folder - for folder in data_path.iterdir() - if folder.is_dir() and "_csv" in folder.name and "__MACOSX" not in folder.name - ), - data_path, - ) - return _set_up_duckdb(extracted_folder, backend_handle) + return _set_up_duckdb(data_path, backend_handle) def mimic_ii(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: From bebe5951153164bf2cc1fa9e2278fbe924010f69 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Mon, 4 Nov 2024 11:42:29 +0100 Subject: [PATCH 04/43] inspect why 2 omop dt fail on github ci --- src/ehrdata/dt/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ehrdata/dt/datasets.py b/src/ehrdata/dt/datasets.py index 4eb3182..7103771 100644 --- a/src/ehrdata/dt/datasets.py +++ b/src/ehrdata/dt/datasets.py @@ -36,7 +36,7 @@ def _set_up_duckdb(path: Path, backend_handle: DuckDBPyConnection, prefix: str = backend_handle.register( file_name_trunk.replace(prefix, ""), - backend_handle.read_csv(f"{path}/{file_name_trunk}.csv", dtype=dtype), + backend_handle.read_csv(f"{path}/{file_name}", dtype=dtype), ) else: unused_files.append(file_name) From 4c879a1511248e655b9b7d20300ae5595e181721 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Mon, 4 Nov 2024 12:06:25 +0100 Subject: [PATCH 05/43] reduce redundancy; enhance docstrings --- src/ehrdata/dt/datasets.py | 109 ++++++++++++++++++------------------- 1 file changed, 52 insertions(+), 57 deletions(-) diff --git a/src/ehrdata/dt/datasets.py b/src/ehrdata/dt/datasets.py index 7103771..c5bb1b8 100644 --- a/src/ehrdata/dt/datasets.py +++ b/src/ehrdata/dt/datasets.py @@ -18,6 +18,7 @@ def _get_table_list() -> list: def _set_up_duckdb(path: Path, backend_handle: DuckDBPyConnection, prefix: str = "") -> None: + """Create tables in the backend from the CSV files in the path from datasets in the OMOP Common Data model.""" tables = _get_table_list() used_tables = [] @@ -49,10 +50,40 @@ def _set_up_duckdb(path: Path, backend_handle: DuckDBPyConnection, prefix: str = print("unused files: ", unused_files) +def _setup_eunomia_datasets( + backend_handle: DuckDBPyConnection, + data_path: Path | None = None, + URL: str = None, + dataset_postfix: str = "", + dataset_prefix: str = "", +) -> None: + """Loads the Eunomia datasets in the OMOP Common Data model.""" + if os.path.exists(data_path): + print(f"Path to data exists, load tables from there: {data_path}") + else: + print("Downloading data...") + response = requests.get(URL) + + if response.status_code == 200: + # Use zipfile and io to open the ZIP file in memory + with zipfile.ZipFile(io.BytesIO(response.content)) as z: + # Extract all contents of the ZIP file + z.extractall("ehrapy_data") # Specify the folder where files will be extracted + print(f"Download successful. ZIP file downloaded and extracted successfully to {data_path}.") + else: + print(f"Failed to download the file. Status code: {response.status_code}") + return + + return _set_up_duckdb(data_path / dataset_postfix, backend_handle, prefix=dataset_prefix) + + def mimic_iv_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: """Loads the MIMIC-IV demo data in the OMOP Common Data model. - More details: https://physionet.org/content/mimic-iv-demo-omop/0.9/#files-panel. + This function loads the MIMIC-IV demo dataset from its `physionet repository _` . + See also this link for more details. + + DOI https://doi.org/10.13026/2d25-8g07. 
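For orientation, the registration step that `_set_up_duckdb` performs boils down to exposing each recognized OMOP CSV as a DuckDB table, stripping a dataset-specific file-name prefix; a simplified sketch (the real helper additionally reports missing tables and unused files, and the directory here is a stand-in):

```python
from pathlib import Path

import duckdb

con = duckdb.connect()
prefix = "2b_"  # the MIMIC-IV demo ships files like "2b_person.csv"
for csv_path in Path("ehrapy_data/some_omop_dataset").glob(f"{prefix}*.csv"):
    # "2b_person.csv" -> table name "person"
    table_name = csv_path.stem.removeprefix(prefix)
    con.register(table_name, con.read_csv(str(csv_path)))
```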
Parameters ---------- @@ -77,29 +108,19 @@ def mimic_iv_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = N if data_path is None: data_path = Path("ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9") - if os.path.exists(data_path): - print(f"Path to data exists, load tables from there: {data_path}") - else: - print("Downloading data...") - URL = "https://physionet.org/static/published-projects/mimic-iv-demo-omop/mimic-iv-demo-data-in-the-omop-common-data-model-0.9.zip" - response = requests.get(URL) - - if response.status_code == 200: - # Use zipfile and io to open the ZIP file in memory - with zipfile.ZipFile(io.BytesIO(response.content)) as z: - # Extract all contents of the ZIP file - z.extractall("ehrapy_data") # Specify the folder where files will be extracted - print(f"Download successful. ZIP file downloaded and extracted successfully to {data_path}.") - else: - print(f"Failed to download the file. Status code: {response.status_code}") - return - - return _set_up_duckdb(data_path / "1_omop_data_csv", backend_handle, prefix="2b_") + return _setup_eunomia_datasets( + backend_handle, + data_path, + URL="https://physionet.org/static/published-projects/mimic-iv-demo-omop/mimic-iv-demo-data-in-the-omop-common-data-model-0.9.zip", + dataset_postfix="1_omop_data_csv", + dataset_prefix="2b_", + ) def gibleed_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: """Loads the GIBleed dataset in the OMOP Common Data model. + This function loads the GIBleed dataset from the `EunomiaDatasets repository _`. More details: https://github.com/OHDSI/EunomiaDatasets/tree/main/datasets/GiBleed. Parameters @@ -125,30 +146,18 @@ def gibleed_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = No if data_path is None: data_path = Path("ehrapy_data/GIBleed_dataset") - if data_path.exists(): - print(f"Path to data exists, load tables from there: {data_path}") - else: - print("Downloading data...") - URL = "https://github.com/OHDSI/EunomiaDatasets/raw/main/datasets/GiBleed/GiBleed_5.3.zip" - response = requests.get(URL) - - if response.status_code == 200: - # Use zipfile and io to open the ZIP file in memory - with zipfile.ZipFile(io.BytesIO(response.content)) as z: - # Extract all contents of the ZIP file into the correct subdirectory - z.extractall(data_path) # Extracting to 'extract_path' - print(f"Download successful. ZIP file downloaded and extracted successfully to {data_path}.") - - else: - print(f"Failed to download the file. Status code: {response.status_code}") - return - - return _set_up_duckdb(data_path / "GiBleed_5.3", backend_handle) + return _setup_eunomia_datasets( + backend_handle, + data_path, + URL="https://github.com/OHDSI/EunomiaDatasets/raw/main/datasets/GiBleed/GiBleed_5.3.zip", + dataset_postfix="GiBleed_5.3", + ) def synthea27nj_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: """Loads the Synthea27NJ dataset in the OMOP Common Data model. + This function loads the Synthea27NJ dataset from the `EunomiaDatasets repository _`. More details: https://github.com/OHDSI/EunomiaDatasets/tree/main/datasets/Synthea27Nj. 
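The download-and-extract block that `_setup_eunomia_datasets` now centralizes is the standard requests/zipfile idiom; an equivalent sketch (using `raise_for_status` instead of the explicit status-code check in the helper):

```python
import io
import zipfile
from pathlib import Path

import requests

url = "https://github.com/OHDSI/EunomiaDatasets/raw/main/datasets/GiBleed/GiBleed_5.3.zip"
data_path = Path("ehrapy_data/GIBleed_dataset")

response = requests.get(url)
response.raise_for_status()  # raise on HTTP errors instead of returning silently
with zipfile.ZipFile(io.BytesIO(response.content)) as z:
    z.extractall(data_path)
```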
Parameters @@ -174,25 +183,11 @@ def synthea27nj_omop(backend_handle: DuckDBPyConnection, data_path: Path | None if data_path is None: data_path = Path("ehrapy_data/Synthea27Nj") - if data_path.exists(): - print(f"Path to data exists, load tables from there: {data_path}") - else: - print("Downloading data...") - URL = "https://github.com/OHDSI/EunomiaDatasets/raw/main/datasets/Synthea27Nj/Synthea27Nj_5.4.zip" - response = requests.get(URL) - - if response.status_code == 200: - # Use zipfile and io to open the ZIP file in memory - with zipfile.ZipFile(io.BytesIO(response.content)) as z: - # Extract all contents of the ZIP file into the correct subdirectory - z.extractall(data_path) # Extracting to 'extract_path' - print(f"Download successful. ZIP file downloaded and extracted successfully to {data_path}.") - - else: - print(f"Failed to download the file. Status code: {response.status_code}") - return - - return _set_up_duckdb(data_path, backend_handle) + return _setup_eunomia_datasets( + backend_handle, + data_path, + URL="https://github.com/OHDSI/EunomiaDatasets/raw/main/datasets/Synthea27Nj/Synthea27Nj_5.4.zip", + ) def mimic_ii(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: From 3253c1a75d8f4077604f7ca0e93bd147b81b66fc Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Mon, 4 Nov 2024 12:34:20 +0100 Subject: [PATCH 06/43] fix paths --- src/ehrdata/dt/datasets.py | 8 +++++--- tests/test_dt/test_dt.py | 3 +++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/ehrdata/dt/datasets.py b/src/ehrdata/dt/datasets.py index c5bb1b8..803e546 100644 --- a/src/ehrdata/dt/datasets.py +++ b/src/ehrdata/dt/datasets.py @@ -68,8 +68,10 @@ def _setup_eunomia_datasets( # Use zipfile and io to open the ZIP file in memory with zipfile.ZipFile(io.BytesIO(response.content)) as z: # Extract all contents of the ZIP file - z.extractall("ehrapy_data") # Specify the folder where files will be extracted - print(f"Download successful. ZIP file downloaded and extracted successfully to {data_path}.") + z.extractall(data_path) # Specify the folder where files will be extracted + print( + f"Download successful. ZIP file downloaded and extracted successfully to {data_path/dataset_postfix}." + ) else: print(f"Failed to download the file. 
Status code: {response.status_code}") return @@ -112,7 +114,7 @@ def mimic_iv_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = N backend_handle, data_path, URL="https://physionet.org/static/published-projects/mimic-iv-demo-omop/mimic-iv-demo-data-in-the-omop-common-data-model-0.9.zip", - dataset_postfix="1_omop_data_csv", + dataset_postfix="mimic-iv-demo-data-in-the-omop-common-data-model-0.9/1_omop_data_csv", dataset_prefix="2b_", ) diff --git a/tests/test_dt/test_dt.py b/tests/test_dt/test_dt.py index 219bf35..b78164d 100644 --- a/tests/test_dt/test_dt.py +++ b/tests/test_dt/test_dt.py @@ -19,16 +19,19 @@ def duckdb_connection(): def test_mimic_iv_omop(duckdb_connection): ed.dt.mimic_iv_omop(backend_handle=duckdb_connection) assert len(duckdb_connection.execute("SHOW TABLES").df()) == 30 + # sanity check of one table assert duckdb_connection.execute("SELECT * FROM person").df().shape == (100, 18) def test_gibleed_omop(duckdb_connection): ed.dt.gibleed_omop(backend_handle=duckdb_connection) assert len(duckdb_connection.execute("SHOW TABLES").df()) == 36 + # sanity check of one table assert duckdb_connection.execute("SELECT * FROM person").df().shape == (2694, 18) def test_synthea27nj_omop(duckdb_connection): ed.dt.synthea27nj_omop(backend_handle=duckdb_connection) assert len(duckdb_connection.execute("SHOW TABLES").df()) == 37 + # sanity check of one table assert duckdb_connection.execute("SELECT * FROM person").df().shape == (28, 18) From c32c47e3e483c6f551178db567e3dcbf043eb4d8 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Mon, 4 Nov 2024 17:38:36 +0100 Subject: [PATCH 07/43] towards more datasets --- src/ehrdata/dt/datasets.py | 86 ++++++++++++++++++++++++++++++++++++-- tests/test_dt/test_dt.py | 4 ++ 2 files changed, 86 insertions(+), 4 deletions(-) diff --git a/src/ehrdata/dt/datasets.py b/src/ehrdata/dt/datasets.py index 803e546..dd7b5a3 100644 --- a/src/ehrdata/dt/datasets.py +++ b/src/ehrdata/dt/datasets.py @@ -1,11 +1,17 @@ +from __future__ import annotations + import io import os import zipfile +from collections.abc import Sequence from pathlib import Path +from typing import TYPE_CHECKING import requests from duckdb.duckdb import DuckDBPyConnection +if TYPE_CHECKING: + from ehrdata import EHRData from ehrdata.utils._omop_utils import get_table_catalog_dict @@ -192,10 +198,82 @@ def synthea27nj_omop(backend_handle: DuckDBPyConnection, data_path: Path | None ) -def mimic_ii(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: - """Loads the MIMIC2 dataset""" - # TODO: replace mimic_ii as is in ehrapy with its dict-of-table return time - map variables to OMOP? +def physionet2012( + data_path: Path | None = None, + interval_length_number: int = 1, + interval_length_unit: str = "day", + num_intervals: int = 48, + aggregation_strategy: str = "last", + drop_samples: Sequence[str] = [ + 147514, + 142731, + 145611, + 140501, + 155655, + 143656, + 156254, + 150309, + 140936, + 141264, + 150649, + 142998, + ], +) -> EHRData: + """Loads the dataset of the `PhysioNet challenge 2012 (v1.0.0) _`. + + If interval_length_number is 1, interval_length_unit is "day", and num_intervals is 48, this is equivalent to the SAITS preprocessing (insert paper/link/citation). + Truncated if a sample has more num_intervals steps; Padded if a sample has less than num_intervals steps. 
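A sketch of the truncate-or-pad behaviour described above, under the assumption that missing steps are filled with NaN (the fill value is not pinned down here; requires a float array):

```python
import numpy as np


def pad_or_truncate(ts: np.ndarray, num_intervals: int) -> np.ndarray:
    """Truncate or NaN-pad the trailing (time) axis to exactly num_intervals steps."""
    if ts.shape[-1] >= num_intervals:
        return ts[..., :num_intervals]
    pad = [(0, 0)] * (ts.ndim - 1) + [(0, num_intervals - ts.shape[-1])]
    return np.pad(ts, pad, constant_values=np.nan)
```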
+ Further, by default the following 12 samples are dropped since they have no time series information at all: 147514, 142731, 145611, 140501, 155655, 143656, 156254, 150309, + 140936, 141264, 150649, 142998. + + Taken the defaults of interval_length_number, interval_length_unit, num_intervals, and drop_samples, the tensor stored in .r of edata is the same as when doing the PyPOTS preprocessing. + A simple deviation is that the tensor in ehrdata is of shape n_obs x n_vars x n_intervals (with defaults, 3000x37x48) while the tensor in PyPOTS is of shape n_obs x n_intervals x n_vars (3000x48x37). + The tensor stored in .r is hence also fully compatible with the PyPOTS package, as the .r tensor of EHRData objects generally is. + + data_path + Path to the raw data. If the path exists, the data is loaded from there. Else, the data is downloaded. + interval_length_number + Numeric value of the length of one interval. + interval_length_unit + Unit belonging to the interval length. + num_intervals + Number of intervals. + aggregation_strategy + Aggregation strategy for the time series data. + drop_samples + Samples to drop from the dataset (indicate their RecordID). + + Returns + ------- + Returns a the processed physionet2012 dataset in an EHRData object. The raw data is also downloaded, stored and available under the data_path. + + Examples + -------- + >>> import ehrapy as ep + >>> import ehrdata as ed + >>> edata = ed.dt.physionet_2012() + >>> edata + """ + if data_path is None: + data_path = Path("ehrapy_data/physionet2012") + + pass + # download data + # load data + # put a/b/c in obs + # put outcomes in obs + # put record id in obs + # put units to var + # put featurenames to var + # put time to t + + +def physionet2019(): + """Loads the dataset of the `PhysioNet challenge 2019 _`.""" raise NotImplementedError() -# TODO: physionet2012, physionet2019 +def mimic_ii(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: + """Loads the MIMIC2 dataset.""" + # TODO: replace mimic_ii as is in ehrapy with its dict-of-table return time - map variables to OMOP? 
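The axis-order remark in the physionet2012 docstring above amounts to a single transpose when moving between the two layouts:

```python
import numpy as np

r = np.zeros((3000, 37, 48))  # EHRData layout: n_obs x n_vars x n_intervals
pypots_layout = r.transpose(0, 2, 1)  # PyPOTS layout: n_obs x n_intervals x n_vars
assert pypots_layout.shape == (3000, 48, 37)
```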
+ raise NotImplementedError() diff --git a/tests/test_dt/test_dt.py b/tests/test_dt/test_dt.py index b78164d..122484c 100644 --- a/tests/test_dt/test_dt.py +++ b/tests/test_dt/test_dt.py @@ -35,3 +35,7 @@ def test_synthea27nj_omop(duckdb_connection): assert len(duckdb_connection.execute("SHOW TABLES").df()) == 37 # sanity check of one table assert duckdb_connection.execute("SELECT * FROM person").df().shape == (28, 18) + + +def test_physionet_2012(): + pass From 7dc19031f09f72981d69b0c1adc967ac7ab17b91 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Wed, 6 Nov 2024 13:24:48 +0100 Subject: [PATCH 08/43] load datasets with ehrapy's download function copied; cleaner table extraction --- src/ehrdata/dt/dataloader.py | 112 +++++++++++++++++++++++++++++++++++ src/ehrdata/dt/datasets.py | 94 +++++++++++++++-------------- 2 files changed, 160 insertions(+), 46 deletions(-) create mode 100644 src/ehrdata/dt/dataloader.py diff --git a/src/ehrdata/dt/dataloader.py b/src/ehrdata/dt/dataloader.py new file mode 100644 index 0000000..46db42b --- /dev/null +++ b/src/ehrdata/dt/dataloader.py @@ -0,0 +1,112 @@ +from __future__ import annotations + +import os +import shutil +import tempfile +from pathlib import Path +from random import choice +from string import ascii_lowercase +from typing import Literal + +import requests +from filelock import FileLock +from rich import print +from rich.progress import Progress + + +def download( + url: str, + archive_format: Literal["zip", "tar", "tar.gz", "tgz"] = None, + output_file_name: str = None, + output_path: str | Path = None, + block_size: int = 1024, + overwrite: bool = False, +) -> None: # pragma: no cover + """Downloads a file irrespective of format. + + Args: + url: URL to download. + archive_format: The format if an archive file. + output_file_name: Name of the downloaded file. + output_path: Path to download/extract the files to. Defaults to 'OS tmpdir' if not specified. + block_size: Block size for downloads in bytes. + overwrite: Whether to overwrite existing files. + """ + if output_file_name is None: + letters = ascii_lowercase + output_file_name = f"ehrapy_tmp_{''.join(choice(letters) for _ in range(10))}" + + if output_path is None: + output_path = tempfile.gettempdir() + + def _sanitize_file_name(file_name): + if os.name == "nt": + file_name = file_name.replace("?", "_").replace("*", "_") + return file_name + + download_to_path = Path( + _sanitize_file_name( + f"{output_path}{output_file_name}" + if str(output_path).endswith("/") + else f"{output_path}/{output_file_name}" + ) + ) + + Path(output_path).mkdir(parents=True, exist_ok=True) + lock_path = f"{download_to_path}.lock" + with FileLock(lock_path): + if download_to_path.exists(): + warning = f"[bold red]File {download_to_path} already exists!" 
+ if not overwrite: + print(warning) + return + else: + print(f"{warning} Overwriting...") + + response = requests.get(url, stream=True) + total = int(response.headers.get("content-length", 0)) + + temp_file_name = f"{download_to_path}.part" + + with Progress(refresh_per_second=1500) as progress: + task = progress.add_task("[red]Downloading...", total=total) + with Path(temp_file_name).open("wb") as file: + for data in response.iter_content(block_size): + file.write(data) + progress.update(task, advance=block_size) + + # force the progress bar to 100% at the end + progress.update(task, completed=total, refresh=True) + + Path(temp_file_name).replace(download_to_path) + + if archive_format: + output_path = output_path or tempfile.gettempdir() + shutil.unpack_archive(download_to_path, output_path, format=archive_format) + download_to_path.unlink() + list_of_paths = [path for path in Path(output_path).resolve().glob("*/") if not path.name.startswith(".")] + latest_path = max(list_of_paths, key=lambda path: path.stat().st_ctime) + shutil.move(latest_path, latest_path.parent / remove_archive_extension(output_file_name)) # type: ignore + + Path(lock_path).unlink(missing_ok=True) + + +def remove_archive_extension(file_path): + """Remove the archive extension from the file path.""" + return ( + str(Path(file_path).with_suffix("")) + if any( + Path(file_path).suffix.endswith(ext) + for ext in [ + ".zip", + ".tar", + ".tar.gz", + ".tgz", + ".tar.bz2", + ".tbz2", + ".tar.xz", + ".txz", + ] + ) + else file_path + ) diff --git a/src/ehrdata/dt/datasets.py b/src/ehrdata/dt/datasets.py index dd7b5a3..96d2cb3 100644 --- a/src/ehrdata/dt/datasets.py +++ b/src/ehrdata/dt/datasets.py @@ -1,19 +1,21 @@ from __future__ import annotations -import io import os -import zipfile +import shutil from collections.abc import Sequence from pathlib import Path from typing import TYPE_CHECKING -import requests from duckdb.duckdb import DuckDBPyConnection +from ehrdata.dt.dataloader import download + if TYPE_CHECKING: from ehrdata import EHRData from ehrdata.utils._omop_utils import get_table_catalog_dict +DOWNLOAD_VERIFICATION_TAG = "download_verification_tag" + def _get_table_list() -> list: flat_table_list = [] @@ -45,7 +47,7 @@ def _set_up_duckdb(path: Path, backend_handle: DuckDBPyConnection, prefix: str = file_name_trunk.replace(prefix, ""), backend_handle.read_csv(f"{path}/{file_name}", dtype=dtype), ) - else: + elif file_name_trunk != DOWNLOAD_VERIFICATION_TAG: unused_files.append(file_name) for table in tables: @@ -59,30 +61,26 @@ def _set_up_duckdb(path: Path, backend_handle: DuckDBPyConnection, prefix: str = def _setup_eunomia_datasets( backend_handle: DuckDBPyConnection, data_path: Path | None = None, - URL: str = None, - dataset_postfix: str = "", + data_url: str = None, + nested_omop_table_path: str = "", dataset_prefix: str = "", ) -> None: """Loads the Eunomia datasets in the OMOP Common Data model.""" - if os.path.exists(data_path): - print(f"Path to data exists, load tables from there: {data_path}") - else: - print("Downloading data...") - response = requests.get(URL) - - if response.status_code == 200: - # Use zipfile and io to open the ZIP file in memory - with zipfile.ZipFile(io.BytesIO(response.content)) as z: - # Extract all contents of the ZIP file - z.extractall(data_path) # Specify the folder where files will be extracted - print( - f"Download successful. ZIP file downloaded and extracted successfully to {data_path/dataset_postfix}." - ) - else: - print(f"Failed to download the file. 
Status code: {response.status_code}") - return - - return _set_up_duckdb(data_path / dataset_postfix, backend_handle, prefix=dataset_prefix) + download( + data_url, + archive_format="zip", + output_file_name=DOWNLOAD_VERIFICATION_TAG, + output_path=data_path, + ) + + for file_path in (data_path / DOWNLOAD_VERIFICATION_TAG / nested_omop_table_path).glob("*.csv"): + shutil.move(file_path, data_path) + + _set_up_duckdb( + data_path, + backend_handle, + prefix=dataset_prefix, + ) def mimic_iv_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: @@ -102,7 +100,7 @@ def mimic_iv_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = N Returns ------- - Returns nothing, but adds the tables to the backend via the handle. + Returns nothing, adds the tables to the backend via the handle. Examples -------- @@ -113,20 +111,21 @@ def mimic_iv_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = N >>> ed.dt.mimic_iv_omop(backend_handle=con) >>> con.execute("SHOW TABLES;").fetchall() """ + data_url = "https://physionet.org/static/published-projects/mimic-iv-demo-omop/mimic-iv-demo-data-in-the-omop-common-data-model-0.9.zip" if data_path is None: data_path = Path("ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9") - return _setup_eunomia_datasets( - backend_handle, - data_path, - URL="https://physionet.org/static/published-projects/mimic-iv-demo-omop/mimic-iv-demo-data-in-the-omop-common-data-model-0.9.zip", - dataset_postfix="mimic-iv-demo-data-in-the-omop-common-data-model-0.9/1_omop_data_csv", + _setup_eunomia_datasets( + backend_handle=backend_handle, + data_path=data_path, + data_url=data_url, + nested_omop_table_path="1_omop_data_csv", dataset_prefix="2b_", ) def gibleed_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: - """Loads the GIBleed dataset in the OMOP Common Data model. + """Loads the GiBleed dataset in the OMOP Common Data model. This function loads the GIBleed dataset from the `EunomiaDatasets repository _`. More details: https://github.com/OHDSI/EunomiaDatasets/tree/main/datasets/GiBleed. @@ -140,7 +139,7 @@ def gibleed_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = No Returns ------- - Returns nothing, but adds the tables to the backend via the handle. + Returns nothing, adds the tables to the backend via the handle. Examples -------- @@ -151,21 +150,22 @@ def gibleed_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = No >>> ed.dt.gibleed_omop(backend_handle=con) >>> con.execute("SHOW TABLES;").fetchall() """ + data_url = "https://github.com/OHDSI/EunomiaDatasets/raw/main/datasets/GiBleed/GiBleed_5.3.zip" + if data_path is None: - data_path = Path("ehrapy_data/GIBleed_dataset") + data_path = Path("ehrapy_data/GiBleed") - return _setup_eunomia_datasets( - backend_handle, - data_path, - URL="https://github.com/OHDSI/EunomiaDatasets/raw/main/datasets/GiBleed/GiBleed_5.3.zip", - dataset_postfix="GiBleed_5.3", + _setup_eunomia_datasets( + backend_handle=backend_handle, + data_path=data_path, + data_url=data_url, ) def synthea27nj_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: - """Loads the Synthea27NJ dataset in the OMOP Common Data model. + """Loads the Synthea27Nj dataset in the OMOP Common Data model. - This function loads the Synthea27NJ dataset from the `EunomiaDatasets repository _`. + This function loads the Synthea27Nj dataset from the `EunomiaDatasets repository _`. 
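The `nested_omop_table_path` handling above simply flattens the dataset-specific subfolder into `data_path` before registration; in isolation, the step for the MIMIC-IV demo looks like this (folder names taken from the call above):

```python
import shutil
from pathlib import Path

data_path = Path("ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9")
nested = data_path / "download_verification_tag" / "1_omop_data_csv"
for file_path in nested.glob("*.csv"):
    shutil.move(file_path, data_path)
```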
More details: https://github.com/OHDSI/EunomiaDatasets/tree/main/datasets/Synthea27Nj. Parameters @@ -177,7 +177,7 @@ def synthea27nj_omop(backend_handle: DuckDBPyConnection, data_path: Path | None Returns ------- - Returns nothing, but adds the tables to the backend via the handle. + Returns nothing, adds the tables to the backend via the handle. Examples -------- @@ -188,13 +188,15 @@ def synthea27nj_omop(backend_handle: DuckDBPyConnection, data_path: Path | None >>> ed.dt.synthea27nj_omop(backend_handle=con) >>> con.execute("SHOW TABLES;").fetchall() """ + data_url = "https://github.com/OHDSI/EunomiaDatasets/raw/main/datasets/Synthea27Nj/Synthea27Nj_5.4.zip" + if data_path is None: data_path = Path("ehrapy_data/Synthea27Nj") - return _setup_eunomia_datasets( - backend_handle, - data_path, - URL="https://github.com/OHDSI/EunomiaDatasets/raw/main/datasets/Synthea27Nj/Synthea27Nj_5.4.zip", + _setup_eunomia_datasets( + backend_handle=backend_handle, + data_path=data_path, + data_url=data_url, ) From 3f241fad6ad83abac9f84bc0f8cd960727432c4a Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Mon, 11 Nov 2024 13:12:02 +0100 Subject: [PATCH 09/43] remove dataloader --- src/ehrdata/dt/dataloader.py | 112 ----------------------------------- 1 file changed, 112 deletions(-) delete mode 100644 src/ehrdata/dt/dataloader.py diff --git a/src/ehrdata/dt/dataloader.py b/src/ehrdata/dt/dataloader.py deleted file mode 100644 index 46db42b..0000000 --- a/src/ehrdata/dt/dataloader.py +++ /dev/null @@ -1,112 +0,0 @@ -from __future__ import annotations - -import os -import shutil -import tempfile -from pathlib import Path -from random import choice -from string import ascii_lowercase -from typing import Literal - -import requests -from filelock import FileLock -from rich import print -from rich.progress import Progress - - -def download( - url: str, - archive_format: Literal["zip", "tar", "tar.gz", "tgz"] = None, - output_file_name: str = None, - output_path: str | Path = None, - block_size: int = 1024, - overwrite: bool = False, -) -> None: # pragma: no cover - """Downloads a file irrespective of format. - - Args: - url: URL to download. - archive_format: The format if an archive file. - output_file_name: Name of the downloaded file. - output_path: Path to download/extract the files to. Defaults to 'OS tmpdir' if not specified. - block_size: Block size for downloads in bytes. - overwrite: Whether to overwrite existing files. - """ - if output_file_name is None: - letters = ascii_lowercase - output_file_name = f"ehrapy_tmp_{''.join(choice(letters) for _ in range(10))}" - - if output_path is None: - output_path = tempfile.gettempdir() - - def _sanitize_file_name(file_name): - if os.name == "nt": - file_name = file_name.replace("?", "_").replace("*", "_") - return file_name - - download_to_path = Path( - _sanitize_file_name( - f"{output_path}{output_file_name}" - if str(output_path).endswith("/") - else f"{output_path}/{output_file_name}" - ) - ) - - Path(output_path).mkdir(parents=True, exist_ok=True) - lock_path = f"{download_to_path}.lock" - with FileLock(lock_path): - if download_to_path.exists(): - warning = f"[bold red]File {download_to_path} already exists!" 
- if not overwrite: - print(warning) - return - else: - print(f"{warning} Overwriting...") - - response = requests.get(url, stream=True) - total = int(response.headers.get("content-length", 0)) - - temp_file_name = f"{download_to_path}.part" - - with Progress(refresh_per_second=1500) as progress: - task = progress.add_task("[red]Downloading...", total=total) - with Path(temp_file_name).open("wb") as file: - for data in response.iter_content(block_size): - file.write(data) - progress.update(task, advance=block_size) - - # force the progress bar to 100% at the end - progress.update(task, completed=total, refresh=True) - - Path(temp_file_name).replace(download_to_path) - - if archive_format: - output_path = output_path or tempfile.gettempdir() - shutil.unpack_archive(download_to_path, output_path, format=archive_format) - download_to_path.unlink() - list_of_paths = [path for path in Path(output_path).resolve().glob("*/") if not path.name.startswith(".")] - latest_path = max(list_of_paths, key=lambda path: path.stat().st_ctime) - shutil.move(latest_path, latest_path.parent / remove_archive_extension(output_file_name)) # type: ignore - - Path(lock_path).unlink(missing_ok=True) - - -def remove_archive_extension(file_path): - """Remove the archive extension from the file path.""" - return ( - str(Path(file_path).with_suffix("")) - if any( - Path(file_path).suffix.endswith(ext) - for ext in [ - ".zip", - ".tar", - ".tar.gz", - ".tgz", - ".tar.bz2", - ".tbz2", - ".tar.xz", - ".txz", - ] - ) - else file_path - ) From 1f329ec8a172fa8074f2ed8fc0cc43761cef8e64 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Mon, 11 Nov 2024 13:19:31 +0100 Subject: [PATCH 10/43] remove physio2012 stubs --- src/ehrdata/dt/datasets.py | 73 +------------------------------------- tests/test_dt/test_dt.py | 4 --- 2 files changed, 1 insertion(+), 76 deletions(-) diff --git a/src/ehrdata/dt/datasets.py b/src/ehrdata/dt/datasets.py index 96d2cb3..b762f37 100644 --- a/src/ehrdata/dt/datasets.py +++ b/src/ehrdata/dt/datasets.py @@ -2,7 +2,6 @@ import os import shutil -from collections.abc import Sequence from pathlib import Path from typing import TYPE_CHECKING @@ -11,7 +10,7 @@ from ehrdata.dt.dataloader import download if TYPE_CHECKING: - from ehrdata import EHRData + pass from ehrdata.utils._omop_utils import get_table_catalog_dict DOWNLOAD_VERIFICATION_TAG = "download_verification_tag" @@ -200,76 +199,6 @@ def synthea27nj_omop(backend_handle: DuckDBPyConnection, data_path: Path | None ) -def physionet2012( - data_path: Path | None = None, - interval_length_number: int = 1, - interval_length_unit: str = "day", - num_intervals: int = 48, - aggregation_strategy: str = "last", - drop_samples: Sequence[str] = [ - 147514, - 142731, - 145611, - 140501, - 155655, - 143656, - 156254, - 150309, - 140936, - 141264, - 150649, - 142998, - ], -) -> EHRData: - """Loads the dataset of the `PhysioNet challenge 2012 (v1.0.0) _`. - - If interval_length_number is 1, interval_length_unit is "day", and num_intervals is 48, this is equivalent to the SAITS preprocessing (insert paper/link/citation). - Truncated if a sample has more num_intervals steps; Padded if a sample has less than num_intervals steps. - Further, by default the following 12 samples are dropped since they have no time series information at all: 147514, 142731, 145611, 140501, 155655, 143656, 156254, 150309, - 140936, 141264, 150649, 142998. 
- - Taken the defaults of interval_length_number, interval_length_unit, num_intervals, and drop_samples, the tensor stored in .r of edata is the same as when doing the PyPOTS preprocessing. - A simple deviation is that the tensor in ehrdata is of shape n_obs x n_vars x n_intervals (with defaults, 3000x37x48) while the tensor in PyPOTS is of shape n_obs x n_intervals x n_vars (3000x48x37). - The tensor stored in .r is hence also fully compatible with the PyPOTS package, as the .r tensor of EHRData objects generally is. - - data_path - Path to the raw data. If the path exists, the data is loaded from there. Else, the data is downloaded. - interval_length_number - Numeric value of the length of one interval. - interval_length_unit - Unit belonging to the interval length. - num_intervals - Number of intervals. - aggregation_strategy - Aggregation strategy for the time series data. - drop_samples - Samples to drop from the dataset (indicate their RecordID). - - Returns - ------- - Returns a the processed physionet2012 dataset in an EHRData object. The raw data is also downloaded, stored and available under the data_path. - - Examples - -------- - >>> import ehrapy as ep - >>> import ehrdata as ed - >>> edata = ed.dt.physionet_2012() - >>> edata - """ - if data_path is None: - data_path = Path("ehrapy_data/physionet2012") - - pass - # download data - # load data - # put a/b/c in obs - # put outcomes in obs - # put record id in obs - # put units to var - # put featurenames to var - # put time to t - - def physionet2019(): """Loads the dataset of the `PhysioNet challenge 2019 _`.""" raise NotImplementedError() diff --git a/tests/test_dt/test_dt.py b/tests/test_dt/test_dt.py index 122484c..b78164d 100644 --- a/tests/test_dt/test_dt.py +++ b/tests/test_dt/test_dt.py @@ -35,7 +35,3 @@ def test_synthea27nj_omop(duckdb_connection): assert len(duckdb_connection.execute("SHOW TABLES").df()) == 37 # sanity check of one table assert duckdb_connection.execute("SELECT * FROM person").df().shape == (28, 18) - - -def test_physionet_2012(): - pass From 3947f3acafd5de2548a26e1464b8ef16fd4df9ca Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Mon, 11 Nov 2024 19:48:50 +0100 Subject: [PATCH 11/43] switch to logging instead of prints --- src/ehrdata/__init__.py | 4 ++++ src/ehrdata/dt/dataloader.py | 10 ++++++---- src/ehrdata/dt/datasets.py | 5 +++-- src/ehrdata/logging_config.py | 10 ++++++++++ 4 files changed, 23 insertions(+), 6 deletions(-) create mode 100644 src/ehrdata/logging_config.py diff --git a/src/ehrdata/__init__.py b/src/ehrdata/__init__.py index 62c5c45..d657790 100644 --- a/src/ehrdata/__init__.py +++ b/src/ehrdata/__init__.py @@ -6,3 +6,7 @@ __all__ = ["EHRData", "dt", "io", "pl", "pp", "tl"] __version__ = version("ehrdata") + +from .logging_config import configure_logging + +configure_logging() diff --git a/src/ehrdata/dt/dataloader.py b/src/ehrdata/dt/dataloader.py index bf51f50..a31e568 100644 --- a/src/ehrdata/dt/dataloader.py +++ b/src/ehrdata/dt/dataloader.py @@ -1,5 +1,6 @@ from __future__ import annotations +import logging import os import shutil import tempfile @@ -10,7 +11,8 @@ import requests from filelock import FileLock -from rich import print + +# from rich import print from rich.progress import Progress @@ -56,12 +58,12 @@ def _sanitize_file_name(file_name): lock_path = f"{download_to_path}.lock" with FileLock(lock_path): if _remove_archive_extension(download_to_path).exists(): - warning = f"[bold red]File {_remove_archive_extension(download_to_path)} already exists!" 
+ warning = f"File {_remove_archive_extension(download_to_path)} already exists!" if not overwrite: - print(warning) + logging.info(warning) return else: - print(f"{warning} Overwriting...") + logging.info(f"{warning} Overwriting...") response = requests.get(url, stream=True) total = int(response.headers.get("content-length", 0)) diff --git a/src/ehrdata/dt/datasets.py b/src/ehrdata/dt/datasets.py index ce1e965..43091ef 100644 --- a/src/ehrdata/dt/datasets.py +++ b/src/ehrdata/dt/datasets.py @@ -1,5 +1,6 @@ from __future__ import annotations +import logging import os import shutil from collections.abc import Sequence @@ -57,8 +58,8 @@ def _set_up_duckdb(path: Path, backend_handle: DuckDBPyConnection, prefix: str = if table not in used_tables: missing_tables.append(table) - print("missing tables: ", missing_tables) - print("unused files: ", unused_files) + logging.info(f"missing tables: {missing_tables}") + logging.info(f"unused files: {unused_files}") def _setup_eunomia_datasets( diff --git a/src/ehrdata/logging_config.py b/src/ehrdata/logging_config.py new file mode 100644 index 0000000..66ade38 --- /dev/null +++ b/src/ehrdata/logging_config.py @@ -0,0 +1,10 @@ +import logging + + +def configure_logging(level=logging.INFO): + """Configures logging for the package.""" + logging.basicConfig( + level=level, + format="%(levelname)s - %(message)s", + force=True, + ) From ff0dd740f7e6e71d59dd3370498c29ab46a7effd Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Mon, 11 Nov 2024 20:18:57 +0100 Subject: [PATCH 12/43] check individual connections to resolve synthea27nj --- tests/test_dt/test_dt.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/test_dt/test_dt.py b/tests/test_dt/test_dt.py index 8f1ca6d..217a854 100644 --- a/tests/test_dt/test_dt.py +++ b/tests/test_dt/test_dt.py @@ -17,25 +17,31 @@ def duckdb_connection(): con.close() -def test_mimic_iv_omop(duckdb_connection): +def test_mimic_iv_omop(): + duckdb_connection = duckdb.connect() ed.dt.mimic_iv_omop(backend_handle=duckdb_connection) assert len(duckdb_connection.execute("SHOW TABLES").df()) == 30 # sanity check of one table assert duckdb_connection.execute("SELECT * FROM person").df().shape == (100, 18) + duckdb_connection.close() -def test_gibleed_omop(duckdb_connection): +def test_gibleed_omop(): + duckdb_connection = duckdb.connect() ed.dt.gibleed_omop(backend_handle=duckdb_connection) assert len(duckdb_connection.execute("SHOW TABLES").df()) == 36 # sanity check of one table assert duckdb_connection.execute("SELECT * FROM person").df().shape == (2694, 18) + duckdb_connection.close() -def test_synthea27nj_omop(duckdb_connection): +def test_synthea27nj_omop(): + duckdb_connection = duckdb.connect() ed.dt.synthea27nj_omop(backend_handle=duckdb_connection) assert len(duckdb_connection.execute("SHOW TABLES").df()) == 37 # sanity check of one table assert duckdb_connection.execute("SELECT * FROM person").df().shape == (28, 18) + duckdb_connection.close() def test_physionet2012(): From 66f232f3634aec794fc5ae19b19031b063048cf6 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Mon, 11 Nov 2024 20:35:02 +0100 Subject: [PATCH 13/43] try to empty cache --- .github/workflows/test.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index f069c66..68784ef 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -49,6 +49,8 @@ jobs: cache-dependency-path: pyproject.toml - name: Install dependencies run: uv pip install 
--system ${{ matrix.pip-flags }} ".[dev,test]" + - name: Delete pytest cache + run: rm -rf .pytest_cache - name: Test env: MPLBACKEND: agg From 6a2e3ed06405069e7ae6752e537c400b6cec766b Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Mon, 11 Nov 2024 20:39:48 +0100 Subject: [PATCH 14/43] new test dir, undo workflows.yml --- .github/workflows/test.yaml | 2 -- tests/test_dt/test_dt.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 68784ef..f069c66 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -49,8 +49,6 @@ jobs: cache-dependency-path: pyproject.toml - name: Install dependencies run: uv pip install --system ${{ matrix.pip-flags }} ".[dev,test]" - - name: Delete pytest cache - run: rm -rf .pytest_cache - name: Test env: MPLBACKEND: agg diff --git a/tests/test_dt/test_dt.py b/tests/test_dt/test_dt.py index 217a854..e951f37 100644 --- a/tests/test_dt/test_dt.py +++ b/tests/test_dt/test_dt.py @@ -6,7 +6,7 @@ import ehrdata as ed -TEST_DATA_DIR = Path(__file__).parent / "ehrapy_data" +TEST_DATA_DIR = Path(__file__).parent / "ehrapy_data2" @pytest.fixture(scope="function") From e1a6abea4607daf353f324a8df5fafea63b297b8 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Mon, 11 Nov 2024 20:45:10 +0100 Subject: [PATCH 15/43] try different dir --- tests/test_dt/test_dt.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_dt/test_dt.py b/tests/test_dt/test_dt.py index e951f37..ff973be 100644 --- a/tests/test_dt/test_dt.py +++ b/tests/test_dt/test_dt.py @@ -19,7 +19,7 @@ def duckdb_connection(): def test_mimic_iv_omop(): duckdb_connection = duckdb.connect() - ed.dt.mimic_iv_omop(backend_handle=duckdb_connection) + ed.dt.mimic_iv_omop(data_path=TEST_DATA_DIR, backend_handle=duckdb_connection) assert len(duckdb_connection.execute("SHOW TABLES").df()) == 30 # sanity check of one table assert duckdb_connection.execute("SELECT * FROM person").df().shape == (100, 18) @@ -28,7 +28,7 @@ def test_mimic_iv_omop(): def test_gibleed_omop(): duckdb_connection = duckdb.connect() - ed.dt.gibleed_omop(backend_handle=duckdb_connection) + ed.dt.gibleed_omop(data_path=TEST_DATA_DIR, backend_handle=duckdb_connection) assert len(duckdb_connection.execute("SHOW TABLES").df()) == 36 # sanity check of one table assert duckdb_connection.execute("SELECT * FROM person").df().shape == (2694, 18) @@ -37,7 +37,7 @@ def test_gibleed_omop(): def test_synthea27nj_omop(): duckdb_connection = duckdb.connect() - ed.dt.synthea27nj_omop(backend_handle=duckdb_connection) + ed.dt.synthea27nj_omop(data_path=TEST_DATA_DIR, backend_handle=duckdb_connection) assert len(duckdb_connection.execute("SHOW TABLES").df()) == 37 # sanity check of one table assert duckdb_connection.execute("SELECT * FROM person").df().shape == (28, 18) From df3827ad0fd70b93a36cc89385d6a50f6d6838c0 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Thu, 14 Nov 2024 20:03:20 +0100 Subject: [PATCH 16/43] improve download and datasets --- src/ehrdata/dt/dataloader.py | 117 ++++++++++++++--------------------- src/ehrdata/dt/datasets.py | 33 +++++----- tests/test_dt/test_dt.py | 16 ++--- 3 files changed, 68 insertions(+), 98 deletions(-) diff --git a/src/ehrdata/dt/dataloader.py b/src/ehrdata/dt/dataloader.py index a31e568..201c956 100644 --- a/src/ehrdata/dt/dataloader.py +++ b/src/ehrdata/dt/dataloader.py @@ -5,9 +5,6 @@ import shutil import tempfile from pathlib import Path -from random import choice -from string import 
ascii_lowercase
-from typing import Literal

 import requests
 from filelock import FileLock
@@ -18,9 +15,7 @@

 def download(
     url: str,
-    archive_format: Literal["zip", "tar", "tar.gz", "tgz"] = None,
-    output_file_name: str = None,
-    output_path: str | Path = None,
+    saving_path: Path | str,
     block_size: int = 1024,
     overwrite: bool = False,
 ) -> None:  # pragma: no cover
@@ -28,47 +23,52 @@

     Args:
         url: URL to download.
-        archive_format: The format if an archive file.
-        output_file_name: Name of the downloaded file.
-        output_path: Path to download/extract the files to. Defaults to 'OS tmpdir' if not specified.
+        saving_path: Where the data should be downloaded to.
         block_size: Block size for downloads in bytes.
         overwrite: Whether to overwrite existing files.
     """
-    if output_file_name is None:
-        letters = ascii_lowercase
-        output_file_name = f"ehrapy_tmp_{''.join(choice(letters) for _ in range(10))}"
-
-    if output_path is None:
-        output_path = tempfile.gettempdir()
-
-    def _sanitize_file_name(file_name):
-        if os.name == "nt":
-            file_name = file_name.replace("?", "_").replace("*", "_")
-        return file_name
-
-    download_to_path = Path(
-        _sanitize_file_name(
-            f"{output_path}{output_file_name}"
-            if str(output_path).endswith("/")
-            else f"{output_path}/{output_file_name}"
-        )
-    )
-
-    Path(output_path).mkdir(parents=True, exist_ok=True)
-    lock_path = f"{download_to_path}.lock"
+    # note: tar.gz has to be before gz for the _remove_archive_extension function to remove the entire extension
+    compression_formats = ["tar.gz", "zip", "tar", "gz", "bz", "xz"]
+    raw_formats = ["csv", "txt", "parquet"]
+
+    saving_path = Path(saving_path)
+    # urls can end with "?download"
+    file_name = os.path.basename(url).split("?")[0]
+    suffix = file_name.split(".")[-1]
+
+    def _remove_archive_extension(file_path: str) -> str:
+        for ext in compression_formats:
+            # if the file path ends with extension, remove the extension and the dot before it (hence the -1)
+            if file_path.endswith(ext):
+                return file_path[: -len(ext) - 1]
+        return file_path
+
+    if suffix in raw_formats:
+        raw_data_saving_path = saving_path / file_name
+        path_to_check = raw_data_saving_path
+    elif suffix in compression_formats:
+        tmpdir = tempfile.mkdtemp()
+        raw_data_saving_path = Path(tmpdir) / file_name
+        path_to_check = saving_path / _remove_archive_extension(file_name)
+    else:
+        raise RuntimeError(f"Unknown file format: {suffix}")
+
+    if path_to_check.exists():
+        info = f"File {path_to_check} already exists!"
+        if not overwrite:
+            logging.info(f"{info} Use downloaded dataset...")
+            return
+        else:
+            logging.info(f"{info} Overwriting...")
+
+    logging.info(f"Downloading {file_name} from {url} to {raw_data_saving_path}")
+
+    lock_path = f"{raw_data_saving_path}.lock"
     with FileLock(lock_path):
-        if _remove_archive_extension(download_to_path).exists():
-            warning = f"File {_remove_archive_extension(download_to_path)} already exists!"
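
[editor's note] For orientation, a minimal sketch of how the reworked download() above is expected to be called; the URL and target folder are placeholders, not part of the patch:

    from ehrdata.dt.dataloader import download

    # an archive suffix (e.g. .zip) is fetched to a temp dir and unpacked into saving_path;
    # raw suffixes (csv/txt/parquet) are written to saving_path directly
    download(
        "https://example.org/GiBleed_5.3.zip",
        saving_path="ehrapy_data/GiBleed_5.3",
    )
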
- if not overwrite: - logging.info(warning) - return - else: - logging.info(f"{warning} Overwriting...") - response = requests.get(url, stream=True) total = int(response.headers.get("content-length", 0)) - temp_file_name = f"{download_to_path}.part" + temp_file_name = f"{raw_data_saving_path}.part" with Progress(refresh_per_second=1500) as progress: task = progress.add_task("[red]Downloading...", total=total) @@ -80,34 +79,12 @@ def _sanitize_file_name(file_name): # force the progress bar to 100% at the end progress.update(task, completed=total, refresh=True) - Path(temp_file_name).replace(download_to_path) + Path(temp_file_name).replace(raw_data_saving_path) - if archive_format: - output_path = output_path or tempfile.gettempdir() - shutil.unpack_archive(download_to_path, output_path, format=archive_format) - download_to_path.unlink() - list_of_paths = [path for path in Path(output_path).resolve().glob("*/") if not path.name.startswith(".")] - latest_path = max(list_of_paths, key=lambda path: path.stat().st_ctime) - shutil.move( - latest_path, - latest_path.parent / _remove_archive_extension(output_file_name), - ) # type: ignore + if suffix in compression_formats: + shutil.unpack_archive(raw_data_saving_path, saving_path) + logging.info( + f"Extracted archive {file_name} from {raw_data_saving_path} to {saving_path / _remove_archive_extension(file_name)}" + ) Path(lock_path).unlink(missing_ok=True) - - -def _remove_archive_extension(file_path): - path = Path(file_path) - for ext in [ - ".tar.gz", - ".tgz", - ".tar.bz2", - ".tbz2", - ".tar.xz", - ".txz", - ".zip", - ".tar", - ]: - if str(path).endswith(ext): - return Path(str(path)[: -len(ext)]) - return Path(path) diff --git a/src/ehrdata/dt/datasets.py b/src/ehrdata/dt/datasets.py index 43091ef..38b222a 100644 --- a/src/ehrdata/dt/datasets.py +++ b/src/ehrdata/dt/datasets.py @@ -63,22 +63,21 @@ def _set_up_duckdb(path: Path, backend_handle: DuckDBPyConnection, prefix: str = def _setup_eunomia_datasets( + data_url: str, backend_handle: DuckDBPyConnection, data_path: Path | None = None, - data_url: str = None, - nested_omop_table_path: str = "", + nested_omop_tables_folder: str = None, dataset_prefix: str = "", ) -> None: """Loads the Eunomia datasets in the OMOP Common Data model.""" download( data_url, - archive_format="zip", - output_file_name=DOWNLOAD_VERIFICATION_TAG, - output_path=data_path, + saving_path=data_path, ) - for file_path in (data_path / DOWNLOAD_VERIFICATION_TAG / nested_omop_table_path).glob("*.csv"): - shutil.move(file_path, data_path) + if nested_omop_tables_folder: + for file_path in (data_path / nested_omop_tables_folder).glob("*.csv"): + shutil.move(file_path, data_path) _set_up_duckdb( data_path, @@ -120,10 +119,10 @@ def mimic_iv_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = N data_path = Path("ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9") _setup_eunomia_datasets( + data_url=data_url, backend_handle=backend_handle, data_path=data_path, - data_url=data_url, - nested_omop_table_path="1_omop_data_csv", + nested_omop_tables_folder="mimic-iv-demo-data-in-the-omop-common-data-model-0.9/1_omop_data_csv", dataset_prefix="2b_", ) @@ -157,12 +156,13 @@ def gibleed_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = No data_url = "https://github.com/OHDSI/EunomiaDatasets/raw/main/datasets/GiBleed/GiBleed_5.3.zip" if data_path is None: - data_path = Path("ehrapy_data/GiBleed") + data_path = Path("ehrapy_data/GiBleed_5.3") _setup_eunomia_datasets( + data_url=data_url, 
backend_handle=backend_handle, data_path=data_path, - data_url=data_url, + nested_omop_tables_folder="GiBleed_5.3", ) @@ -195,12 +195,12 @@ def synthea27nj_omop(backend_handle: DuckDBPyConnection, data_path: Path | None data_url = "https://github.com/OHDSI/EunomiaDatasets/raw/main/datasets/Synthea27Nj/Synthea27Nj_5.4.zip" if data_path is None: - data_path = Path("ehrapy_data/Synthea27Nj") + data_path = Path("ehrapy_data/Synthea27Nj_5.4") _setup_eunomia_datasets( + data_url=data_url, backend_handle=backend_handle, data_path=data_path, - data_url=data_url, ) @@ -289,16 +289,13 @@ def physionet2012( for file_name in temp_data_set_names: download( url=f"https://physionet.org/files/challenge-2012/1.0.0/{file_name}.tar.gz?download", - output_path=data_path, - output_file_name=file_name + ".tar.gz", - archive_format="gztar", + saving_path=data_path, ) for file_name in outcome_file_names: download( url=f"https://physionet.org/files/challenge-2012/1.0.0/{file_name}?download", - output_path=data_path, - output_file_name=file_name, + saving_path=data_path, ) static_features = ["Age", "Gender", "ICUType", "Height"] diff --git a/tests/test_dt/test_dt.py b/tests/test_dt/test_dt.py index ff973be..02fb030 100644 --- a/tests/test_dt/test_dt.py +++ b/tests/test_dt/test_dt.py @@ -1,13 +1,9 @@ -from pathlib import Path - import duckdb import numpy as np import pytest import ehrdata as ed -TEST_DATA_DIR = Path(__file__).parent / "ehrapy_data2" - @pytest.fixture(scope="function") def duckdb_connection(): @@ -17,27 +13,27 @@ def duckdb_connection(): con.close() -def test_mimic_iv_omop(): +def test_mimic_iv_omop(tmp_path): duckdb_connection = duckdb.connect() - ed.dt.mimic_iv_omop(data_path=TEST_DATA_DIR, backend_handle=duckdb_connection) + ed.dt.mimic_iv_omop(data_path=tmp_path, backend_handle=duckdb_connection) assert len(duckdb_connection.execute("SHOW TABLES").df()) == 30 # sanity check of one table assert duckdb_connection.execute("SELECT * FROM person").df().shape == (100, 18) duckdb_connection.close() -def test_gibleed_omop(): +def test_gibleed_omop(tmp_path): duckdb_connection = duckdb.connect() - ed.dt.gibleed_omop(data_path=TEST_DATA_DIR, backend_handle=duckdb_connection) + ed.dt.gibleed_omop(data_path=tmp_path, backend_handle=duckdb_connection) assert len(duckdb_connection.execute("SHOW TABLES").df()) == 36 # sanity check of one table assert duckdb_connection.execute("SELECT * FROM person").df().shape == (2694, 18) duckdb_connection.close() -def test_synthea27nj_omop(): +def test_synthea27nj_omop(tmp_path): duckdb_connection = duckdb.connect() - ed.dt.synthea27nj_omop(data_path=TEST_DATA_DIR, backend_handle=duckdb_connection) + ed.dt.synthea27nj_omop(data_path=tmp_path, backend_handle=duckdb_connection) assert len(duckdb_connection.execute("SHOW TABLES").df()) == 37 # sanity check of one table assert duckdb_connection.execute("SELECT * FROM person").df().shape == (28, 18) From 45f388ed8b47ea440b503348881b69eb168ea9fc Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Fri, 15 Nov 2024 11:47:34 +0100 Subject: [PATCH 17/43] fix pandas warning --- src/ehrdata/dt/datasets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ehrdata/dt/datasets.py b/src/ehrdata/dt/datasets.py index 38b222a..3a1eb54 100644 --- a/src/ehrdata/dt/datasets.py +++ b/src/ehrdata/dt/datasets.py @@ -49,7 +49,7 @@ def _set_up_duckdb(path: Path, backend_handle: DuckDBPyConnection, prefix: str = backend_handle.register( file_name_trunk.replace(prefix, ""), - backend_handle.read_csv(f"{path}/{file_name}", 
dtype=dtype), + backend_handle.read_csv(f"{path}/{file_name}", dtype=dtype, delim=","), ) elif file_name_trunk != DOWNLOAD_VERIFICATION_TAG: unused_files.append(file_name) @@ -352,7 +352,7 @@ def physionet2012( df_long_time_seconds = np.array(pd.to_timedelta(df_long["Time"] + ":00").dt.total_seconds()) interval_df_interval_end_offset_seconds = np.array(interval_df["interval_end_offset"].dt.total_seconds()) df_long_interval_step = np.argmax(df_long_time_seconds[:, None] <= interval_df_interval_end_offset_seconds, axis=1) - df_long["interval_step"] = df_long_interval_step + df_long.loc[:, ["interval_step"]] = df_long_interval_step # if one person for one feature (=Parameter) within one interval_step has multiple measurements, decide which one to keep df_long = df_long.drop_duplicates(subset=["RecordID", "Parameter", "interval_step"], keep=aggregation_strategy) From f4df4ffc2b2ff0e0a7f4dfe4db61f35bf877a18b Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Fri, 15 Nov 2024 11:48:57 +0100 Subject: [PATCH 18/43] use , delimiter in duckdb --- src/ehrdata/dt/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ehrdata/dt/datasets.py b/src/ehrdata/dt/datasets.py index 3a1eb54..c6dc72a 100644 --- a/src/ehrdata/dt/datasets.py +++ b/src/ehrdata/dt/datasets.py @@ -49,7 +49,7 @@ def _set_up_duckdb(path: Path, backend_handle: DuckDBPyConnection, prefix: str = backend_handle.register( file_name_trunk.replace(prefix, ""), - backend_handle.read_csv(f"{path}/{file_name}", dtype=dtype, delim=","), + backend_handle.read_csv(f"{path}/{file_name}", dtype=dtype, delimiter=","), ) elif file_name_trunk != DOWNLOAD_VERIFICATION_TAG: unused_files.append(file_name) From bbc663e31b995a90355c24e998ca961dd38470ab Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Fri, 15 Nov 2024 11:52:15 +0100 Subject: [PATCH 19/43] remove extract's from api.md --- docs/api.md | 10 ---------- src/ehrdata/io/omop/omop.py | 11 ----------- 2 files changed, 21 deletions(-) diff --git a/docs/api.md b/docs/api.md index 12fb317..2dc62aa 100644 --- a/docs/api.md +++ b/docs/api.md @@ -24,16 +24,6 @@ io.omop.setup_variables io.omop.get_time_interval_table io.omop.load - io.omop.extract_person - io.omop.extract_observation_period - io.omop.extract_measurement - io.omop.extract_observation - io.omop.extract_procedure_occurrence - io.omop.extract_specimen - io.omop.extract_device_exposure - io.omop.extract_drug_exposure - io.omop.extract_condition_occurrence - io.omop.extract_note ``` ## Datasets diff --git a/src/ehrdata/io/omop/omop.py b/src/ehrdata/io/omop/omop.py index 6034b17..f9b460f 100644 --- a/src/ehrdata/io/omop/omop.py +++ b/src/ehrdata/io/omop/omop.py @@ -460,17 +460,6 @@ def extract_procedure_occurrence(duckdb_instance): ) -def extract_specimen(duckdb_instance): - """Extract a table of an OMOP CDM Database.""" - return get_table( - duckdb_instance, - table_name="specimen", - concept_id_col="specimen_concept_id", - value_col="unit_concept_id", # Assuming `unit_concept_id` is a suitable value field - timestamp_col="specimen_datetime", - ) - - def extract_device_exposure(duckdb_instance): """Extract a table of an OMOP CDM Database.""" # return get_table( From 839ff091bda2251f66bfb4f93e437d591814cdd3 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Fri, 15 Nov 2024 12:22:22 +0100 Subject: [PATCH 20/43] read w/ pandas instead of duckdb --- src/ehrdata/dt/datasets.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/ehrdata/dt/datasets.py b/src/ehrdata/dt/datasets.py index 
c6dc72a..23dac96 100644 --- a/src/ehrdata/dt/datasets.py +++ b/src/ehrdata/dt/datasets.py @@ -47,10 +47,12 @@ def _set_up_duckdb(path: Path, backend_handle: DuckDBPyConnection, prefix: str = else: dtype = None - backend_handle.register( - file_name_trunk.replace(prefix, ""), - backend_handle.read_csv(f"{path}/{file_name}", dtype=dtype, delimiter=","), - ) + df = pd.read_csv(f"{path}/{file_name}", dtype=dtype) # noqa: F841 + backend_handle.execute(f"CREATE TABLE {file_name_trunk.replace(prefix, '')} AS SELECT * FROM df") + # backend_handle.register( + # file_name_trunk.replace(prefix, ""), + # backend_handle.read_csv(f"{path}/{file_name}", dtype=dtype, delimiter=","), + # ) elif file_name_trunk != DOWNLOAD_VERIFICATION_TAG: unused_files.append(file_name) From dafda91e1daee62015fa2fb7665697a33810d0a4 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Sun, 17 Nov 2024 18:59:28 +0100 Subject: [PATCH 21/43] support different capitalizations; check if pre-release fail again..? --- src/ehrdata/dt/datasets.py | 66 +++------ src/ehrdata/io/omop/__init__.py | 3 +- src/ehrdata/io/omop/omop.py | 129 ++++++++++++------ tests/conftest.py | 12 +- .../toy_omop/capital_letters/MEASUREMENT.csv | 2 + .../capital_letters/OBSERVATION_PERIOD.csv | 2 + .../data/toy_omop/capital_letters/PERSON.csv | 2 + tests/test_io/test_omop.py | 27 ++++ 8 files changed, 146 insertions(+), 97 deletions(-) create mode 100644 tests/data/toy_omop/capital_letters/MEASUREMENT.csv create mode 100644 tests/data/toy_omop/capital_letters/OBSERVATION_PERIOD.csv create mode 100644 tests/data/toy_omop/capital_letters/PERSON.csv diff --git a/src/ehrdata/dt/datasets.py b/src/ehrdata/dt/datasets.py index 23dac96..152a3eb 100644 --- a/src/ehrdata/dt/datasets.py +++ b/src/ehrdata/dt/datasets.py @@ -1,7 +1,5 @@ from __future__ import annotations -import logging -import os import shutil from collections.abc import Sequence from pathlib import Path @@ -12,56 +10,18 @@ from duckdb.duckdb import DuckDBPyConnection from ehrdata.dt.dataloader import download +from ehrdata.io.omop import setup_connection from ehrdata.io.omop._queries import _generate_timedeltas -from ehrdata.utils._omop_utils import get_table_catalog_dict if TYPE_CHECKING: from ehrdata import EHRData -DOWNLOAD_VERIFICATION_TAG = "download_verification_tag" - -def _get_table_list() -> list: - flat_table_list = [] - for _, value_list in get_table_catalog_dict().items(): - for value in value_list: - flat_table_list.append(value) - return flat_table_list - - -def _set_up_duckdb(path: Path, backend_handle: DuckDBPyConnection, prefix: str = "") -> None: - """Create tables in the backend from the CSV files in the path from datasets in the OMOP Common Data model.""" - tables = _get_table_list() - - used_tables = [] - missing_tables = [] - unused_files = [] - for file_name in os.listdir(path): - file_name_trunk = file_name.split(".")[0].lower() - - if file_name_trunk in tables or file_name_trunk.replace(prefix, "") in tables: - used_tables.append(file_name_trunk.replace(prefix, "")) - - if file_name_trunk == "measurement": - dtype = {"measurement_source_value": str} - else: - dtype = None - - df = pd.read_csv(f"{path}/{file_name}", dtype=dtype) # noqa: F841 - backend_handle.execute(f"CREATE TABLE {file_name_trunk.replace(prefix, '')} AS SELECT * FROM df") - # backend_handle.register( - # file_name_trunk.replace(prefix, ""), - # backend_handle.read_csv(f"{path}/{file_name}", dtype=dtype, delimiter=","), - # ) - elif file_name_trunk != DOWNLOAD_VERIFICATION_TAG: - unused_files.append(file_name) - 
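
[editor's note] Regarding the pd.read_csv + CREATE TABLE pattern introduced in the previous patch: it relies on DuckDB's replacement scans, which resolve a Python DataFrame variable by name inside SQL. A self-contained sketch, with a hypothetical file name:

    import duckdb
    import pandas as pd

    con = duckdb.connect()
    df = pd.read_csv("person.csv")  # hypothetical CSV
    # DuckDB looks up `df` in the surrounding Python scope
    con.execute("CREATE TABLE person AS SELECT * FROM df")
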
- for table in tables: - if table not in used_tables: - missing_tables.append(table) - - logging.info(f"missing tables: {missing_tables}") - logging.info(f"unused files: {unused_files}") +COLUMN_CASE = { + "uppercase": "uppercase", + "lowercase": "lowercase", + "titlecase": "titlecase", +} def _setup_eunomia_datasets( @@ -81,11 +41,12 @@ def _setup_eunomia_datasets( for file_path in (data_path / nested_omop_tables_folder).glob("*.csv"): shutil.move(file_path, data_path) - _set_up_duckdb( + edata = setup_connection( data_path, backend_handle, prefix=dataset_prefix, ) + return edata def mimic_iv_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: @@ -120,13 +81,14 @@ def mimic_iv_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = N if data_path is None: data_path = Path("ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9") - _setup_eunomia_datasets( + edata = _setup_eunomia_datasets( data_url=data_url, backend_handle=backend_handle, data_path=data_path, nested_omop_tables_folder="mimic-iv-demo-data-in-the-omop-common-data-model-0.9/1_omop_data_csv", dataset_prefix="2b_", ) + return edata def gibleed_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: @@ -160,13 +122,15 @@ def gibleed_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = No if data_path is None: data_path = Path("ehrapy_data/GiBleed_5.3") - _setup_eunomia_datasets( + edata = _setup_eunomia_datasets( data_url=data_url, backend_handle=backend_handle, data_path=data_path, nested_omop_tables_folder="GiBleed_5.3", ) + return edata + def synthea27nj_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: """Loads the Synthea27Nj dataset in the OMOP Common Data model. @@ -199,12 +163,14 @@ def synthea27nj_omop(backend_handle: DuckDBPyConnection, data_path: Path | None if data_path is None: data_path = Path("ehrapy_data/Synthea27Nj_5.4") - _setup_eunomia_datasets( + edata = _setup_eunomia_datasets( data_url=data_url, backend_handle=backend_handle, data_path=data_path, ) + return edata + def mimic_ii(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: """Loads the MIMIC2 dataset.""" diff --git a/src/ehrdata/io/omop/__init__.py b/src/ehrdata/io/omop/__init__.py index 8cd4668..6fb9860 100644 --- a/src/ehrdata/io/omop/__init__.py +++ b/src/ehrdata/io/omop/__init__.py @@ -1,7 +1,6 @@ from .omop import ( get_table, get_time_interval_table, - load, # extract_condition_occurrence, # extract_device_exposure, # extract_drug_exposure, @@ -13,7 +12,7 @@ # extract_person_observation_period, # extract_procedure_occurrence, # extract_specimen, - register_omop_to_db_connection, + setup_connection, setup_obs, setup_variables, ) diff --git a/src/ehrdata/io/omop/omop.py b/src/ehrdata/io/omop/omop.py index f9b460f..3d6efb6 100644 --- a/src/ehrdata/io/omop/omop.py +++ b/src/ehrdata/io/omop/omop.py @@ -1,5 +1,6 @@ from __future__ import annotations +import logging import os from collections.abc import Sequence from pathlib import Path @@ -9,24 +10,78 @@ import duckdb import numpy as np import pandas as pd +from duckdb.duckdb import DuckDBPyConnection from ehrdata.io.omop._queries import ( AGGREGATION_STRATEGY_KEY, time_interval_table_query_long_format, ) -from ehrdata.utils._omop_utils import get_omop_table_names +from ehrdata.utils._omop_utils import get_table_catalog_dict + +DOWNLOAD_VERIFICATION_TAG = "download_verification_tag" VALID_OBSERVATION_TABLES_SINGLE = ["person"] VALID_OBSERVATION_TABLES_JOIN = 
["person_cohort", "person_observation_period", "person_visit_occurrence"] VALID_VARIABLE_TABLES = ["measurement", "observation", "specimen"] -def _check_sanity_of_folder(folder_path: str | Path): - pass +def _get_table_list() -> list: + flat_table_list = [] + for _, value_list in get_table_catalog_dict().items(): + for value in value_list: + flat_table_list.append(value) + return flat_table_list -def _check_sanity_of_database(backend_handle: duckdb.DuckDB): - pass +def _set_up_duckdb(path: Path, backend_handle: DuckDBPyConnection, prefix: str = "") -> str: + """Create tables in the backend from the CSV files in the path from datasets in the OMOP Common Data model.""" + tables = _get_table_list() + + used_tables = [] + missing_tables = [] + unused_files = [] + for file_name in os.listdir(path): + file_name_trunk = file_name.split(".")[0].lower() + regular_omop_table_name = file_name_trunk.replace(prefix, "") + + if regular_omop_table_name in tables: + used_tables.append(regular_omop_table_name) + + if regular_omop_table_name == "measurement": + dtype = {"measurement_source_value": str} + else: + dtype = None + + # read raw csv as temporary table + temp_relation = backend_handle.read_csv(path / file_name, dtype=dtype) # noqa: F841 + backend_handle.execute("CREATE OR REPLACE TABLE temp_table AS SELECT * FROM temp_relation") + + # make query to create table with lowercase column names + column_names = backend_handle.execute("DESCRIBE temp_table").df()["column_name"].values + select_columns = ", ".join([f'"{col}" AS "{col.lower()}"' for col in column_names]) + create_table_with_lowercase_columns_query = ( + f"CREATE TABLE {regular_omop_table_name} AS SELECT {select_columns} FROM temp_table" + ) + + # write proper table + existing_tables = backend_handle.execute("SHOW TABLES").df()["name"].values + if regular_omop_table_name in existing_tables: + logging.info(f"Table {regular_omop_table_name} already exists. Dropping and recreating...") + backend_handle.execute(f"DROP TABLE {regular_omop_table_name}") + + backend_handle.execute(create_table_with_lowercase_columns_query) + + backend_handle.execute("DROP TABLE temp_table") + + elif file_name_trunk != DOWNLOAD_VERIFICATION_TAG: + unused_files.append(file_name) + + for table in tables: + if table not in used_tables: + missing_tables.append(table) + + logging.info(f"missing tables: {missing_tables}") + logging.info(f"unused files: {unused_files}") def _check_valid_backend_handle(backend_handle) -> None: @@ -182,28 +237,28 @@ def _create_enriched_var_with_unit_info(backend_handle, ds, var, unit_report) -> return feature_concept_id_unit_info_table -def register_omop_to_db_connection( - path: Path, - backend_handle: duckdb.duckdb.DuckDBPyConnection, - source: Literal["csv"] = "csv", -) -> None: - """Register the OMOP CDM tables to the database.""" - missing_tables = [] - for table in get_omop_table_names(): - # if path exists lowercse, uppercase, capitalized: - table_path = f"{path}/{table}.csv" - if os.path.exists(table_path): - if table == "measurement": - backend_handle.register( - table, backend_handle.read_csv(f"{path}/{table}.csv", dtype={"measurement_source_value": str}) - ) - else: - backend_handle.register(table, backend_handle.read_csv(f"{path}/{table}.csv")) - else: - missing_tables.append([table]) - print("missing tables: ", missing_tables) +def setup_connection(path: Path | str, backend_handle: DuckDBPyConnection, prefix: str = "") -> None: + """Setup a connection to the OMOP CDM database. 
-    return None
+    This function sets up a connection to the OMOP CDM database.
+    Table and column names are normalized to lowercase, so the CSV files can use any capitalization style.
+
+
+    Parameters
+    ----------
+    path
+        The path to the folder containing the CSV files.
+    backend_handle
+        The backend handle to the database.
+    prefix
+        The prefix to be removed from the CSV filenames.
+
+    Returns
+    -------
+    None. The tables are created on the passed DuckDB connection, with lowercase table and column names.
+
+    """
+    _set_up_duckdb(Path(path), backend_handle, prefix)


 def setup_obs(
@@ -326,7 +381,7 @@ def setup_variables(
     if time_defining_table is None:
        raise ValueError("The observation table must be set up first, use the `setup_obs` function.")

-    if data_tables[0] in ["measurement", "observation"]:
+    if data_tables[0] in ["measurement", "observation", "specimen"]:
         # also keep unit_concept_id and unit_source_value;
         if isinstance(data_field_to_keep, list):
             data_field_to_keep = list(data_field_to_keep) + ["unit_concept_id", "unit_source_value"]
@@ -359,7 +414,10 @@ def setup_variables(
     unit_report = _create_feature_unit_concept_id_report(backend_handle, ds)

     var = ds["data_table_concept_id"].to_dataframe()
-    concepts = backend_handle.sql("SELECT * FROM concept").df()
+
+    if enrich_var_with_feature_info or enrich_var_with_unit_info:
+        concepts = backend_handle.sql("SELECT * FROM concept").df()
+        concepts.columns = concepts.columns.str.lower()

     if enrich_var_with_feature_info:
         var = pd.merge(var, concepts, how="left", left_index=True, right_on="concept_id")
@@ -393,21 +451,6 @@ def setup_variables(
     return edata


-def load(
-    backend_handle: Literal[str, duckdb, Path],
-    # folder_path: str,
-    # delimiter: str = ",",
-    # make_filename_lowercase: bool = True,
-) -> None:
-    """Initialize a connection to the OMOP CDM Database."""
-    if isinstance(backend_handle, str) or isinstance(backend_handle, Path):
-        _check_sanity_of_folder(backend_handle)
-    elif isinstance(backend_handle, duckdb.DuckDB):
-        _check_sanity_of_database(backend_handle)
-    else:
-        raise NotImplementedError(f"Backend {backend_handle} not supported.
Choose a valid backend.") - - def get_table(duckdb_instance, table_name: str) -> pd.DataFrame: """Extract a table of an OMOP CDM Database.""" return _lowercase_column_names(duckdb_instance.sql(f"SELECT * FROM {table_name}").df()) diff --git a/tests/conftest.py b/tests/conftest.py index 8f5fbc0..baf5e94 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,12 +1,20 @@ import duckdb import pytest -from ehrdata.io.omop import register_omop_to_db_connection +from ehrdata.io.omop import setup_connection @pytest.fixture # (scope="session") def omop_connection_vanilla(): con = duckdb.connect() - register_omop_to_db_connection(path="tests/data/toy_omop/vanilla", backend_handle=con, source="csv") + setup_connection(path="tests/data/toy_omop/vanilla", backend_handle=con) + yield con + con.close() + + +@pytest.fixture # (scope="session") +def omop_connection_capital_letters(): + con = duckdb.connect() + setup_connection(path="tests/data/toy_omop/capital_letters", backend_handle=con) yield con con.close() diff --git a/tests/data/toy_omop/capital_letters/MEASUREMENT.csv b/tests/data/toy_omop/capital_letters/MEASUREMENT.csv new file mode 100644 index 0000000..5a548ac --- /dev/null +++ b/tests/data/toy_omop/capital_letters/MEASUREMENT.csv @@ -0,0 +1,2 @@ +MEASUREMENT_ID,PERSON_ID,MEASUREMENT_CONCEPT_ID,MEASUREMENT_DATE,MEASUREMENT_DATETIME,MEASUREMENT_TIME,MEASUREMENT_TYPE_CONCEPT_ID,OPERATOR_CONCEPT_ID,VALUE_AS_NUMBER,VALUE_AS_CONCEPT_ID,UNIT_CONCEPT_ID,RANGE_LOW,RANGE_HIGH,PROVIDER_ID,VISIT_OCCURRENCE_ID,VISIT_DETAIL_ID,MEASUREMENT_SOURCE_VALUE,MEASUREMENT_SOURCE_CONCEPT_ID,UNIT_SOURCE_VALUE,VALUE_SOURCE_VALUE +1,1,3031147,2100-01-01,2100-01-01 12:00:00,12:00:00,32856,,18,,9557,21,30,,1,,50804,2000001003,mEq/L,18 diff --git a/tests/data/toy_omop/capital_letters/OBSERVATION_PERIOD.csv b/tests/data/toy_omop/capital_letters/OBSERVATION_PERIOD.csv new file mode 100644 index 0000000..04d6e15 --- /dev/null +++ b/tests/data/toy_omop/capital_letters/OBSERVATION_PERIOD.csv @@ -0,0 +1,2 @@ +OBSERVATION_PERIOD_ID,PERSON_ID,OBSERVATION_PERIOD_START_DATE,OBSERVATION_PERIOD_END_DATE,PERIOD_TYPE_CONCEPT_ID +1,1,2100-01-01,2100-01-31,32828 diff --git a/tests/data/toy_omop/capital_letters/PERSON.csv b/tests/data/toy_omop/capital_letters/PERSON.csv new file mode 100644 index 0000000..413bedf --- /dev/null +++ b/tests/data/toy_omop/capital_letters/PERSON.csv @@ -0,0 +1,2 @@ +PERSON_ID,GENDER_CONCEPT_ID,YEAR_OF_BIRTH,MONTH_OF_BIRTH,DAY_OF_BIRTH,BIRTH_DATETIME,RACE_CONCEPT_ID,ETHNICITY_CONCEPT_ID,LOCATION_ID,PROVIDER_ID,CARE_SITE_ID,PERSON_SOURCE_VALUE,GENDER_SOURCE_VALUE,GENDER_SOURCE_CONCEPT_ID,RACE_SOURCE_VALUE,RACE_SOURCE_CONCEPT_ID,ETHNICITY_SOURCE_VALUE,ETHNICITY_SOURCE_CONCEPT_ID +1,8507,2095,,,,0,38003563,,,,1234,M,0,,,, diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py index 68ed0fc..f6d4024 100644 --- a/tests/test_io/test_omop.py +++ b/tests/test_io/test_omop.py @@ -274,3 +274,30 @@ def test_setup_variables_illegal_argument_types( enrich_var_with_feature_info=enrich_var_with_feature_info, enrich_var_with_unit_info=enrich_var_with_unit_info, ) + + +def test_capital_letters(omop_connection_capital_letters): + # test capital letters both in table names and column names + con = omop_connection_capital_letters + edata = ed.io.omop.setup_obs(backend_handle=con, observation_table="person_observation_period") + edata = ed.io.omop.setup_variables( + edata, + backend_handle=con, + data_tables=["measurement"], + data_field_to_keep=["value_as_number"], + interval_length_number=1, + 
interval_length_unit="day", + num_intervals=1, + enrich_var_with_feature_info=False, + enrich_var_with_unit_info=False, + ) + + assert edata.r[0, 0, 0] == 18 + + tables = con.execute("SHOW TABLES").df()["name"].values + assert "measurement" in tables + assert "MEASUREMENT" not in tables + + measurement_columns = con.execute("SELECT * FROM measurement").df().columns + assert "measurement_id" in measurement_columns + assert "MEASUREMENT_ID" not in measurement_columns From 907c21fbc5119fa3f526dbf8ad6a05657a682ad1 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Sun, 17 Nov 2024 19:00:15 +0100 Subject: [PATCH 22/43] remove load --- docs/api.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/api.md b/docs/api.md index 2dc62aa..b209ba5 100644 --- a/docs/api.md +++ b/docs/api.md @@ -23,7 +23,6 @@ io.omop.setup_obs io.omop.setup_variables io.omop.get_time_interval_table - io.omop.load ``` ## Datasets From c91e4014e40e7a817437dc88c4f7a360f98280f3 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Sun, 17 Nov 2024 19:02:59 +0100 Subject: [PATCH 23/43] remove some things I forgot --- src/ehrdata/dt/datasets.py | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/src/ehrdata/dt/datasets.py b/src/ehrdata/dt/datasets.py index 152a3eb..94a6e2e 100644 --- a/src/ehrdata/dt/datasets.py +++ b/src/ehrdata/dt/datasets.py @@ -17,13 +17,6 @@ from ehrdata import EHRData -COLUMN_CASE = { - "uppercase": "uppercase", - "lowercase": "lowercase", - "titlecase": "titlecase", -} - - def _setup_eunomia_datasets( data_url: str, backend_handle: DuckDBPyConnection, @@ -41,12 +34,11 @@ def _setup_eunomia_datasets( for file_path in (data_path / nested_omop_tables_folder).glob("*.csv"): shutil.move(file_path, data_path) - edata = setup_connection( + setup_connection( data_path, backend_handle, prefix=dataset_prefix, ) - return edata def mimic_iv_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: @@ -81,14 +73,13 @@ def mimic_iv_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = N if data_path is None: data_path = Path("ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9") - edata = _setup_eunomia_datasets( + _setup_eunomia_datasets( data_url=data_url, backend_handle=backend_handle, data_path=data_path, nested_omop_tables_folder="mimic-iv-demo-data-in-the-omop-common-data-model-0.9/1_omop_data_csv", dataset_prefix="2b_", ) - return edata def gibleed_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: @@ -122,15 +113,13 @@ def gibleed_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = No if data_path is None: data_path = Path("ehrapy_data/GiBleed_5.3") - edata = _setup_eunomia_datasets( + _setup_eunomia_datasets( data_url=data_url, backend_handle=backend_handle, data_path=data_path, nested_omop_tables_folder="GiBleed_5.3", ) - return edata - def synthea27nj_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: """Loads the Synthea27Nj dataset in the OMOP Common Data model. 
@@ -163,14 +152,12 @@ def synthea27nj_omop(backend_handle: DuckDBPyConnection, data_path: Path | None if data_path is None: data_path = Path("ehrapy_data/Synthea27Nj_5.4") - edata = _setup_eunomia_datasets( + _setup_eunomia_datasets( data_url=data_url, backend_handle=backend_handle, data_path=data_path, ) - return edata - def mimic_ii(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: """Loads the MIMIC2 dataset.""" From a7c7af287e4a3de8ad7db675ff53b1573726ce05 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Sun, 17 Nov 2024 19:21:06 +0100 Subject: [PATCH 24/43] move validity checks to separate file --- src/ehrdata/io/omop/_check_arguments.py | 94 ++++++++++++++++++++++ src/ehrdata/io/omop/omop.py | 102 ++++-------------------- 2 files changed, 111 insertions(+), 85 deletions(-) create mode 100644 src/ehrdata/io/omop/_check_arguments.py diff --git a/src/ehrdata/io/omop/_check_arguments.py b/src/ehrdata/io/omop/_check_arguments.py new file mode 100644 index 0000000..ca4d753 --- /dev/null +++ b/src/ehrdata/io/omop/_check_arguments.py @@ -0,0 +1,94 @@ +from __future__ import annotations + +from collections.abc import Sequence + +import duckdb + +from ehrdata.io.omop._queries import ( + AGGREGATION_STRATEGY_KEY, +) + +DOWNLOAD_VERIFICATION_TAG = "download_verification_tag" +VALID_OBSERVATION_TABLES_SINGLE = ["person"] +VALID_OBSERVATION_TABLES_JOIN = ["person_cohort", "person_observation_period", "person_visit_occurrence"] +VALID_VARIABLE_TABLES = ["measurement", "observation", "specimen"] + + +def _check_valid_backend_handle(backend_handle) -> None: + if not isinstance(backend_handle, duckdb.duckdb.DuckDBPyConnection): + raise TypeError("Expected backend_handle to be of type DuckDBPyConnection.") + + +def _check_valid_observation_table(observation_table) -> None: + if not isinstance(observation_table, str): + raise TypeError("Expected observation_table to be a string.") + if observation_table not in VALID_OBSERVATION_TABLES_SINGLE + VALID_OBSERVATION_TABLES_JOIN: + raise ValueError( + f"observation_table must be one of {VALID_OBSERVATION_TABLES_SINGLE+VALID_OBSERVATION_TABLES_JOIN}." 
+ ) + + +def _check_valid_death_table(death_table) -> None: + if not isinstance(death_table, bool): + raise TypeError("Expected death_table to be a boolean.") + + +def _check_valid_edata(edata) -> None: + from ehrdata import EHRData + + if not isinstance(edata, EHRData): + raise TypeError("Expected edata to be of type EHRData.") + + +def _check_valid_data_tables(data_tables) -> Sequence: + if isinstance(data_tables, str): + data_tables = [data_tables] + if not isinstance(data_tables, Sequence): + raise TypeError("Expected data_tables to be a string or Sequence.") + if not all(table in VALID_VARIABLE_TABLES for table in data_tables): + raise ValueError(f"data_tables must be a subset of {VALID_VARIABLE_TABLES}.") + return data_tables + + +def _check_valid_data_field_to_keep(data_field_to_keep) -> Sequence: + if isinstance(data_field_to_keep, str): + data_field_to_keep = [data_field_to_keep] + if not isinstance(data_field_to_keep, Sequence): + raise TypeError("Expected data_field_to_keep to be a string or Sequence.") + return data_field_to_keep + + +def _check_valid_interval_length_number(interval_length_number) -> None: + if not isinstance(interval_length_number, int): + raise TypeError("Expected interval_length_number to be an integer.") + + +def _check_valid_interval_length_unit(interval_length_unit) -> None: + # TODO: maybe check if it is a valid unit from pandas.to_timedelta + if not isinstance(interval_length_unit, str): + raise TypeError("Expected interval_length_unit to be a string.") + + +def _check_valid_num_intervals(num_intervals) -> None: + if not isinstance(num_intervals, int): + raise TypeError("Expected num_intervals to be an integer.") + + +def _check_valid_concept_ids(concept_ids) -> None: + if concept_ids != "all" and not isinstance(concept_ids, Sequence): + raise TypeError("concept_ids must be a sequence of integers or 'all'.") + + +def _check_valid_aggregation_strategy(aggregation_strategy) -> None: + if aggregation_strategy not in AGGREGATION_STRATEGY_KEY.keys(): + raise TypeError(f"aggregation_strategy must be one of {AGGREGATION_STRATEGY_KEY.keys()}.") + + +def _check_valid_enrich_var_with_feature_info(enrich_var_with_feature_info) -> None: + if not isinstance(enrich_var_with_feature_info, bool): + raise TypeError("Expected enrich_var_with_feature_info to be a boolean.") + + +def _check_valid_enrich_var_with_unit_info(enrich_var_with_unit_info) -> None: + if not isinstance(enrich_var_with_unit_info, bool): + raise TypeError("Expected enrich_var_with_unit_info to be a boolean.") diff --git a/src/ehrdata/io/omop/omop.py b/src/ehrdata/io/omop/omop.py index 3d6efb6..2c3d9df 100644 --- a/src/ehrdata/io/omop/omop.py +++ b/src/ehrdata/io/omop/omop.py @@ -12,18 +12,30 @@ import pandas as pd from duckdb.duckdb import DuckDBPyConnection +from ehrdata.io.omop._check_arguments import ( + VALID_OBSERVATION_TABLES_JOIN, + VALID_OBSERVATION_TABLES_SINGLE, + _check_valid_aggregation_strategy, + _check_valid_backend_handle, + _check_valid_concept_ids, + _check_valid_data_field_to_keep, + _check_valid_data_tables, + _check_valid_death_table, + _check_valid_edata, + _check_valid_enrich_var_with_feature_info, + _check_valid_enrich_var_with_unit_info, + _check_valid_interval_length_number, + _check_valid_interval_length_unit, + _check_valid_num_intervals, + _check_valid_observation_table, +) from ehrdata.io.omop._queries import ( - AGGREGATION_STRATEGY_KEY, time_interval_table_query_long_format, ) from ehrdata.utils._omop_utils import get_table_catalog_dict DOWNLOAD_VERIFICATION_TAG = 
"download_verification_tag" -VALID_OBSERVATION_TABLES_SINGLE = ["person"] -VALID_OBSERVATION_TABLES_JOIN = ["person_cohort", "person_observation_period", "person_visit_occurrence"] -VALID_VARIABLE_TABLES = ["measurement", "observation", "specimen"] - def _get_table_list() -> list: flat_table_list = [] @@ -84,86 +96,6 @@ def _set_up_duckdb(path: Path, backend_handle: DuckDBPyConnection, prefix: str = logging.info(f"unused files: {unused_files}") -def _check_valid_backend_handle(backend_handle) -> None: - if not isinstance(backend_handle, duckdb.duckdb.DuckDBPyConnection): - raise TypeError("Expected backend_handle to be of type DuckDBPyConnection.") - - -def _check_valid_observation_table(observation_table) -> None: - if not isinstance(observation_table, str): - raise TypeError("Expected observation_table to be a string.") - if observation_table not in VALID_OBSERVATION_TABLES_SINGLE + VALID_OBSERVATION_TABLES_JOIN: - raise ValueError( - f"observation_table must be one of {VALID_OBSERVATION_TABLES_SINGLE+VALID_OBSERVATION_TABLES_JOIN}." - ) - - -def _check_valid_death_table(death_table) -> None: - if not isinstance(death_table, bool): - raise TypeError("Expected death_table to be a boolean.") - - -def _check_valid_edata(edata) -> None: - from ehrdata import EHRData - - if not isinstance(edata, EHRData): - raise TypeError("Expected edata to be of type EHRData.") - - -def _check_valid_data_tables(data_tables) -> Sequence: - if isinstance(data_tables, str): - data_tables = [data_tables] - if not isinstance(data_tables, Sequence): - raise TypeError("Expected data_tables to be a string or Sequence.") - if not all(table in VALID_VARIABLE_TABLES for table in data_tables): - raise ValueError(f"data_tables must be a subset of {VALID_VARIABLE_TABLES}.") - return data_tables - - -def _check_valid_data_field_to_keep(data_field_to_keep) -> Sequence: - if isinstance(data_field_to_keep, str): - data_field_to_keep = [data_field_to_keep] - if not isinstance(data_field_to_keep, Sequence): - raise TypeError("Expected data_field_to_keep to be a string or Sequence.") - return data_field_to_keep - - -def _check_valid_interval_length_number(interval_length_number) -> None: - if not isinstance(interval_length_number, int): - raise TypeError("Expected interval_length_number to be an integer.") - - -def _check_valid_interval_length_unit(interval_length_unit) -> None: - # TODO: maybe check if it is a valid unit from pandas.to_timedelta - if not isinstance(interval_length_unit, str): - raise TypeError("Expected interval_length_unit to be a string.") - - -def _check_valid_num_intervals(num_intervals) -> None: - if not isinstance(num_intervals, int): - raise TypeError("Expected num_intervals to be an integer.") - - -def _check_valid_concept_ids(concept_ids) -> None: - if concept_ids != "all" and not isinstance(concept_ids, Sequence): - raise TypeError("concept_ids must be a sequence of integers or 'all'.") - - -def _check_valid_aggregation_strategy(aggregation_strategy) -> None: - if aggregation_strategy not in AGGREGATION_STRATEGY_KEY.keys(): - raise TypeError(f"aggregation_strategy must be one of {AGGREGATION_STRATEGY_KEY.keys()}.") - - -def _check_valid_enrich_var_with_feature_info(enrich_var_with_feature_info) -> None: - if not isinstance(enrich_var_with_feature_info, bool): - raise TypeError("Expected enrich_var_with_feature_info to be a boolean.") - - -def _check_valid_enrich_var_with_unit_info(enrich_var_with_unit_info) -> None: - if not isinstance(enrich_var_with_unit_info, bool): - raise TypeError("Expected 
enrich_var_with_unit_info to be a boolean.") - - def _collect_units_per_feature(ds, unit_key="unit_concept_id") -> dict: feature_units = {} for i in range(ds[unit_key].shape[1]): From 09b0c34a9cb259ff8c2fc6e7935dd7800998cd22 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Sun, 17 Nov 2024 21:54:48 +0100 Subject: [PATCH 25/43] enable and vanilla test for specimen table --- tests/data/toy_omop/vanilla/specimen.csv | 10 ++++++++++ tests/test_io/test_omop.py | 13 +++++++------ 2 files changed, 17 insertions(+), 6 deletions(-) create mode 100644 tests/data/toy_omop/vanilla/specimen.csv diff --git a/tests/data/toy_omop/vanilla/specimen.csv b/tests/data/toy_omop/vanilla/specimen.csv new file mode 100644 index 0000000..ada93dd --- /dev/null +++ b/tests/data/toy_omop/vanilla/specimen.csv @@ -0,0 +1,10 @@ +specimen_id,person_id,specimen_concept_id,specimen_type_concept_id,specimen_date,specimen_datetime,quantity,unit_concept_id,anatomic_site_concept_id,disease_status_concept_id,specimen_source_id,specimen_source_value,unit_source_value,anatomic_site_source_value,disease_status_source_value +1,1,4001225,32856,2100-01-01,2100-01-01 12:00:00,0.5,,0,0,,70012,,, +2,1,4001225,32856,2100-01-01,2100-01-01 13:00:00,0.5,,0,0,,70012,,, +3,1,4121345,32856,2100-01-01,2100-01-01 14:00:00,1.5,,0,0,,70021,,, +4,2,4001225,32856,2100-01-01,2100-01-01 12:00:00,0.5,,0,0,,70012,,, +5,2,4001225,32856,2100-01-01,2100-01-01 13:00:00,0.5,,0,0,,70012,,, +6,2,4121345,32856,2100-01-01,2100-01-01 14:00:00,1.5,,0,0,,70021,,, +7,3,4001225,32856,2100-01-01,2100-01-01 12:00:00,0.5,,0,0,,70012,,, +8,3,4001225,32856,2100-01-01,2100-01-01 13:00:00,0.5,,0,0,,70012,,, +9,3,4121345,32856,2100-01-01,2100-01-01 14:00:00,1.5,,0,0,,70021,,, diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py index f6d4024..81ae099 100644 --- a/tests/test_io/test_omop.py +++ b/tests/test_io/test_omop.py @@ -13,6 +13,7 @@ VANILLA_NUM_CONCEPTS = { "measurement": 2, "observation": 2, + "specimen": 2, } # constants for setup_variables @@ -88,12 +89,12 @@ def test_setup_obs_invalid_observation_table_value(omop_connection_vanilla): ["person_cohort", "person_observation_period", "person_visit_occurrence"], ) @pytest.mark.parametrize( - "data_tables", - [["measurement"], ["observation"]], -) -@pytest.mark.parametrize( - "data_field_to_keep", - [["value_as_number"], ["value_as_concept_id"]], + "data_tables,data_field_to_keep", + [ + (["measurement"], ["value_as_number", "value_as_concept_id"]), + (["observation"], ["value_as_number", "value_as_concept_id"]), + (["specimen"], ["quantity"]), + ], ) @pytest.mark.parametrize( "enrich_var_with_feature_info", From 6854159718504a11637e87a8e2031103067e66de Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Mon, 18 Nov 2024 12:15:20 +0100 Subject: [PATCH 26/43] escape the % in duckdb's read_csv --- src/ehrdata/io/omop/omop.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ehrdata/io/omop/omop.py b/src/ehrdata/io/omop/omop.py index 2c3d9df..efd5bdd 100644 --- a/src/ehrdata/io/omop/omop.py +++ b/src/ehrdata/io/omop/omop.py @@ -65,7 +65,7 @@ def _set_up_duckdb(path: Path, backend_handle: DuckDBPyConnection, prefix: str = dtype = None # read raw csv as temporary table - temp_relation = backend_handle.read_csv(path / file_name, dtype=dtype) # noqa: F841 + temp_relation = backend_handle.read_csv(path / file_name, dtype=dtype, escapechar="%") # noqa: F841 backend_handle.execute("CREATE OR REPLACE TABLE temp_table AS SELECT * FROM temp_relation") # make query to create table with lowercase 
column names From fcd641b3aeec3b52e3132a2c938435dd8f8b35d2 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Mon, 18 Nov 2024 15:38:42 +0100 Subject: [PATCH 27/43] add robustness: empty observation,measurement,speciment acceptable --- src/ehrdata/io/omop/omop.py | 152 ++++++++++++++++++++++++++++++++++++ tests/conftest.py | 8 ++ tests/test_io/test_omop.py | 18 +++++ 3 files changed, 178 insertions(+) diff --git a/src/ehrdata/io/omop/omop.py b/src/ehrdata/io/omop/omop.py index efd5bdd..344fdac 100644 --- a/src/ehrdata/io/omop/omop.py +++ b/src/ehrdata/io/omop/omop.py @@ -325,6 +325,158 @@ def setup_variables( else: raise ValueError + # dbms complains about our queries, which sometimes need a column to be of type e.g. datetime, when it can't infer types from data + count = backend_handle.execute(f"SELECT COUNT(*) as count FROM {data_tables[0]}").df()["count"].item() + if count == 0: + logging.info(f"No data found in {data_tables[0]}. Returning edata without additional variables.") + return edata + + ds = ( + time_interval_table_query_long_format( + backend_handle=backend_handle, + time_defining_table=time_defining_table, + data_table=data_tables[0], + data_field_to_keep=data_field_to_keep, + interval_length_number=interval_length_number, + interval_length_unit=interval_length_unit, + num_intervals=num_intervals, + aggregation_strategy=aggregation_strategy, + ) + .set_index(["person_id", "data_table_concept_id", "interval_step"]) + .to_xarray() + ) + + _check_one_unit_per_feature(ds) + # TODO ignore? go with more vanilla omop style. _check_one_unit_per_feature(ds, unit_key="unit_source_value") + + unit_report = _create_feature_unit_concept_id_report(backend_handle, ds) + + var = ds["data_table_concept_id"].to_dataframe() + + if enrich_var_with_feature_info or enrich_var_with_unit_info: + concepts = backend_handle.sql("SELECT * FROM concept").df() + concepts.columns = concepts.columns.str.lower() + + if enrich_var_with_feature_info: + var = pd.merge(var, concepts, how="left", left_index=True, right_on="concept_id") + + if enrich_var_with_unit_info: + if unit_report["multiple_units"].sum() > 0: + raise ValueError("Multiple units per feature found. Enrichment with feature information not possible.") + else: + var = pd.merge( + var, + unit_report, + how="left", + left_index=True, + right_on="unit_concept_id", + suffixes=("", "_unit"), + ) + var = pd.merge( + var, + concepts, + how="left", + left_on="unit_concept_id", + right_on="concept_id", + suffixes=("", "_unit"), + ) + + t = ds["interval_step"].to_dataframe() + + edata = EHRData(r=ds[data_field_to_keep[0]].values, obs=edata.obs, var=var, uns=edata.uns, t=t) + edata.uns[f"unit_report_{data_tables[0]}"] = unit_report + + return edata + + +def setup_interval_variables( + edata, + *, + backend_handle: duckdb.duckdb.DuckDBPyConnection, + data_tables: Sequence[Literal["drug_exposure"]] | Literal["drug_exposure"], + data_field_to_keep: str | Sequence[str], + interval_length_number: int, + interval_length_unit: str, + num_intervals: int, + concept_ids: Literal["all"] | Sequence = "all", + aggregation_strategy: str = "last", + enrich_var_with_feature_info: bool = False, + enrich_var_with_unit_info: bool = False, + keep_start_date_only: bool = False, +): + """Setup the interval variables + + This function sets up the variables that are stored as interval in OMOP for the EHRData object. + It will fail if there is more than one unit_concept_id per feature. + Writes a unit report of the features to edata.uns["unit_report_"]. 
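
[editor's note] A sketch of how the unit report written by setup_variables above is meant to be consumed; the argument values are illustrative:

    edata = ed.io.omop.setup_variables(
        edata,
        backend_handle=con,
        data_tables=["measurement"],
        data_field_to_keep=["value_as_number"],
        interval_length_number=1,
        interval_length_unit="day",
        num_intervals=30,
        enrich_var_with_unit_info=True,
    )
    # one row per feature; the 'multiple_units' column flags features with >1 unit_concept_id
    edata.uns["unit_report_measurement"]
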
+
+    Parameters
+    ----------
+    backend_handle
+        The backend handle to the database.
+    edata
+        The EHRData object to which the variables should be added.
+    data_tables
+        The table to be used. Only a single table can be used.
+    data_field_to_keep
+        The CDM Field in the data table to be kept. Can be e.g. "value_as_number" or "value_as_concept_id".
+    interval_length_number
+        Numeric value of the length of one interval.
+    interval_length_unit
+        Unit belonging to the interval length.
+    num_intervals
+        Number of intervals.
+    concept_ids
+        Concept IDs to use from this data table. If not specified, 'all' are used.
+    aggregation_strategy
+        Strategy to use when aggregating multiple data points within one interval.
+    enrich_var_with_feature_info
+        Whether to enrich the var table with feature information. If a concept_id is not found in the concept table, the feature information will be NaN.
+    enrich_var_with_unit_info
+        Whether to enrich the var table with unit information. Raises an Error if a) multiple units per feature are found for at least one feature. If a concept_id is not found in the concept table, the feature information will be NaN.
+
+    Returns
+    -------
+    An EHRData object with populated .r and .var fields.
+    """
+    from ehrdata import EHRData
+
+    _check_valid_edata(edata)
+    _check_valid_backend_handle(backend_handle)
+    data_tables = _check_valid_data_tables(data_tables)
+    data_field_to_keep = _check_valid_data_field_to_keep(data_field_to_keep)
+    _check_valid_interval_length_number(interval_length_number)
+    _check_valid_interval_length_unit(interval_length_unit)
+    _check_valid_num_intervals(num_intervals)
+    _check_valid_concept_ids(concept_ids)
+    _check_valid_aggregation_strategy(aggregation_strategy)
+    _check_valid_enrich_var_with_feature_info(enrich_var_with_feature_info)
+    _check_valid_enrich_var_with_unit_info(enrich_var_with_unit_info)
+
+    time_defining_table = edata.uns.get("omop_io_observation_table", None)
+    if time_defining_table is None:
+        raise ValueError("The observation table must be set up first, use the `setup_obs` function.")
+
+    if data_tables[0] in ["drug_exposure"]:
+        # also keep unit_concept_id and unit_source_value;
+        if isinstance(data_field_to_keep, list):
+            data_field_to_keep = list(data_field_to_keep) + ["unit_concept_id", "unit_source_value"]
+        # TODO: use in future version when more than one data table can be used
+        # elif isinstance(data_field_to_keep, dict):
+        #     data_field_to_keep = {
+        #         k: v + ["unit_concept_id", "unit_source_value"] for k, v in data_field_to_keep.items()
+        #     }
+    else:
+        raise ValueError
+
+    # dbms complains about our queries, which sometimes need a column to be of type e.g.
datetime, when it can't infer types from data
+    count = backend_handle.execute(f"SELECT COUNT(*) as count FROM {data_tables[0]}").df()["count"].item()
+    if count == 0:
+        logging.info(f"No data in {data_tables}.")
+        return edata
+
     ds = (
         time_interval_table_query_long_format(
             backend_handle=backend_handle,
diff --git a/tests/conftest.py b/tests/conftest.py
index baf5e94..a42fcb1 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -18,3 +18,11 @@ def omop_connection_capital_letters():
     setup_connection(path="tests/data/toy_omop/capital_letters", backend_handle=con)
     yield con
     con.close()
+
+
+@pytest.fixture  # (scope="session")
+def omop_connection_empty_observation():
+    con = duckdb.connect()
+    setup_connection(path="tests/data/toy_omop/empty_observation", backend_handle=con)
+    yield con
+    con.close()
diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py
index 81ae099..01c995a 100644
--- a/tests/test_io/test_omop.py
+++ b/tests/test_io/test_omop.py
@@ -302,3 +302,21 @@ def test_capital_letters(omop_connection_capital_letters):
     measurement_columns = con.execute("SELECT * FROM measurement").df().columns
     assert "measurement_id" in measurement_columns
     assert "MEASUREMENT_ID" not in measurement_columns
+
+
+def test_empty_observation(omop_connection_empty_observation, caplog):
+    con = omop_connection_empty_observation
+    edata = ed.io.omop.setup_obs(backend_handle=con, observation_table="person")
+    edata = ed.io.omop.setup_variables(
+        edata,
+        backend_handle=con,
+        data_tables=["observation"],
+        data_field_to_keep=["value_as_number"],
+        interval_length_number=1,
+        interval_length_unit="day",
+        num_intervals=1,
+        enrich_var_with_feature_info=False,
+        enrich_var_with_unit_info=False,
+    )
+    assert edata.shape == (1, 0)
+    assert "No data found in observation. Returning edata without additional variables."
in caplog.text From a390b4cb06cba43cf0bced6f897b3ce636bd5e7a Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Mon, 18 Nov 2024 16:16:47 +0100 Subject: [PATCH 28/43] add empty_observation test data --- tests/data/toy_omop/empty_observation/observation.csv | 1 + tests/data/toy_omop/empty_observation/person.csv | 2 ++ 2 files changed, 3 insertions(+) create mode 100644 tests/data/toy_omop/empty_observation/observation.csv create mode 100644 tests/data/toy_omop/empty_observation/person.csv diff --git a/tests/data/toy_omop/empty_observation/observation.csv b/tests/data/toy_omop/empty_observation/observation.csv new file mode 100644 index 0000000..ad1a438 --- /dev/null +++ b/tests/data/toy_omop/empty_observation/observation.csv @@ -0,0 +1 @@ +observation_id,person_id,observation_concept_id,observation_date,observation_datetime,observation_type_concept_id,value_as_number,value_as_string,value_as_concept_id,qualifier_concept_id,unit_concept_id,provider_id,visit_occurrence_id,visit_detail_id,observation_source_value,observation_source_concept_id,unit_source_value,qualifier_source_value diff --git a/tests/data/toy_omop/empty_observation/person.csv b/tests/data/toy_omop/empty_observation/person.csv new file mode 100644 index 0000000..0f13db9 --- /dev/null +++ b/tests/data/toy_omop/empty_observation/person.csv @@ -0,0 +1,2 @@ +person_id,gender_concept_id,year_of_birth,month_of_birth,day_of_birth,birth_datetime,race_concept_id,ethnicity_concept_id,location_id,provider_id,care_site_id,person_source_value,gender_source_value,gender_source_concept_id,race_source_value,race_source_concept_id,ethnicity_source_value,ethnicity_source_concept_id +1,8507,2095,,,,0,38003563,,,,1234,M,0,,,, From adcf18676d5e5bda87583188ba36f1525f843c36 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Mon, 18 Nov 2024 17:50:52 +0100 Subject: [PATCH 29/43] add support for start or end of e.g. 
drug_exposure --- src/ehrdata/io/omop/__init__.py | 1 + src/ehrdata/io/omop/_check_arguments.py | 21 ++++++- src/ehrdata/io/omop/_queries.py | 17 ++++- src/ehrdata/io/omop/omop.py | 63 +++++-------------- tests/data/toy_omop/vanilla/drug_exposure.csv | 10 +++ tests/test_io/test_omop.py | 51 +++++++++++++++ 6 files changed, 112 insertions(+), 51 deletions(-) create mode 100644 tests/data/toy_omop/vanilla/drug_exposure.csv diff --git a/src/ehrdata/io/omop/__init__.py b/src/ehrdata/io/omop/__init__.py index 6fb9860..d6be480 100644 --- a/src/ehrdata/io/omop/__init__.py +++ b/src/ehrdata/io/omop/__init__.py @@ -13,6 +13,7 @@ # extract_procedure_occurrence, # extract_specimen, setup_connection, + setup_interval_variables, setup_obs, setup_variables, ) diff --git a/src/ehrdata/io/omop/_check_arguments.py b/src/ehrdata/io/omop/_check_arguments.py index ca4d753..8b145cf 100644 --- a/src/ehrdata/io/omop/_check_arguments.py +++ b/src/ehrdata/io/omop/_check_arguments.py @@ -12,6 +12,8 @@ VALID_OBSERVATION_TABLES_SINGLE = ["person"] VALID_OBSERVATION_TABLES_JOIN = ["person_cohort", "person_observation_period", "person_visit_occurrence"] VALID_VARIABLE_TABLES = ["measurement", "observation", "specimen"] +VALID_INTERVAL_VARIABLE_TABLES = ["drug_exposure"] +VALID_KEEP_DATES = ["start", "end", "interval"] def _check_valid_backend_handle(backend_handle) -> None: @@ -40,7 +42,7 @@ def _check_valid_edata(edata) -> None: raise TypeError("Expected edata to be of type EHRData.") -def _check_valid_data_tables(data_tables) -> Sequence: +def _check_valid_variable_data_tables(data_tables) -> Sequence: if isinstance(data_tables, str): data_tables = [data_tables] if not isinstance(data_tables, Sequence): @@ -50,6 +52,16 @@ def _check_valid_data_tables(data_tables) -> Sequence: return data_tables +def _check_valid_interval_variable_data_tables(data_tables) -> Sequence: + if isinstance(data_tables, str): + data_tables = [data_tables] + if not isinstance(data_tables, Sequence): + raise TypeError("Expected data_tables to be a string or Sequence.") + if not all(table in VALID_INTERVAL_VARIABLE_TABLES for table in data_tables): + raise ValueError(f"data_tables must be a subset of {VALID_INTERVAL_VARIABLE_TABLES}.") + return data_tables + + def _check_valid_data_field_to_keep(data_field_to_keep) -> Sequence: if isinstance(data_field_to_keep, str): data_field_to_keep = [data_field_to_keep] @@ -92,3 +104,10 @@ def _check_valid_enrich_var_with_feature_info(enrich_var_with_feature_info) -> N def _check_valid_enrich_var_with_unit_info(enrich_var_with_unit_info) -> None: if not isinstance(enrich_var_with_unit_info, bool): raise TypeError("Expected enrich_var_with_unit_info to be a boolean.") + + +def _check_valid_keep_date(keep_date: str) -> None: + if not isinstance(keep_date, str): + raise TypeError("Expected keep_date to be a string.") + if keep_date not in VALID_KEEP_DATES: + raise ValueError(f"keep_date must be one of {VALID_KEEP_DATES}.") diff --git a/src/ehrdata/io/omop/_queries.py b/src/ehrdata/io/omop/_queries.py index abdbf80..2975231 100644 --- a/src/ehrdata/io/omop/_queries.py +++ b/src/ehrdata/io/omop/_queries.py @@ -19,6 +19,13 @@ "cohort": "subject_id", } +DATA_TABLE_CONCEPT_ID_TRUNK = { + "measurement": "measurement", + "observation": "observation", + "specimen": "specimen", + "drug_exposure": "drug", +} + AGGREGATION_STRATEGY_KEY = { "last": "LAST", "first": "FIRST", @@ -83,11 +90,15 @@ def time_interval_table_query_long_format( num_intervals: int, aggregation_strategy: str, data_field_to_keep: Sequence[str] | 
str, + date_prefix: str = "", ) -> pd.DataFrame: """Returns a long format DataFrame from the data_table. The following columns should be considered the indices of this long format: person_id, data_table_concept_id, interval_step. The other columns, except for start_date and end_date, should be considered the values.""" if isinstance(data_field_to_keep, str): data_field_to_keep = [data_field_to_keep] + if date_prefix != "": + date_prefix = date_prefix + "_" + timedeltas_dataframe = _generate_timedeltas(interval_length_number, interval_length_unit, num_intervals) _write_timedeltas_to_db( @@ -110,10 +121,10 @@ def time_interval_table_query_long_format( ), \ person_data_table AS( \ WITH distinct_data_table_concept_ids AS ( \ - SELECT DISTINCT {data_table}_concept_id + SELECT DISTINCT {DATA_TABLE_CONCEPT_ID_TRUNK[data_table]}_concept_id FROM {data_table} \ ) - SELECT person.person_id, {data_table}_concept_id as data_table_concept_id \ + SELECT person.person_id, {DATA_TABLE_CONCEPT_ID_TRUNK[data_table]}_concept_id as data_table_concept_id \ FROM person \ CROSS JOIN distinct_data_table_concept_ids \ ), \ @@ -129,7 +140,7 @@ def time_interval_table_query_long_format( ) \ SELECT lfi.person_id, lfi.data_table_concept_id, interval_step, interval_start, interval_end, {_generate_value_query(data_table, data_field_to_keep, AGGREGATION_STRATEGY_KEY[aggregation_strategy])} \ FROM long_format_intervals as lfi \ - LEFT JOIN {data_table} ON lfi.person_id = {data_table}.person_id AND lfi.data_table_concept_id = {data_table}.{data_table}_concept_id AND {data_table}.{data_table}_date BETWEEN lfi.interval_start AND lfi.interval_end \ + LEFT JOIN {data_table} ON lfi.person_id = {data_table}.person_id AND lfi.data_table_concept_id = {data_table}.{DATA_TABLE_CONCEPT_ID_TRUNK[data_table]}_concept_id AND {data_table}.{data_table}_{date_prefix}date BETWEEN lfi.interval_start AND lfi.interval_end \ GROUP BY lfi.person_id, lfi.data_table_concept_id, interval_step, interval_start, interval_end """ ).df() diff --git a/src/ehrdata/io/omop/omop.py b/src/ehrdata/io/omop/omop.py index 344fdac..876b9d2 100644 --- a/src/ehrdata/io/omop/omop.py +++ b/src/ehrdata/io/omop/omop.py @@ -19,15 +19,17 @@ _check_valid_backend_handle, _check_valid_concept_ids, _check_valid_data_field_to_keep, - _check_valid_data_tables, _check_valid_death_table, _check_valid_edata, _check_valid_enrich_var_with_feature_info, _check_valid_enrich_var_with_unit_info, _check_valid_interval_length_number, _check_valid_interval_length_unit, + _check_valid_interval_variable_data_tables, + _check_valid_keep_date, _check_valid_num_intervals, _check_valid_observation_table, + _check_valid_variable_data_tables, ) from ehrdata.io.omop._queries import ( time_interval_table_query_long_format, @@ -299,7 +301,7 @@ def setup_variables( _check_valid_edata(edata) _check_valid_backend_handle(backend_handle) - data_tables = _check_valid_data_tables(data_tables) + data_tables = _check_valid_variable_data_tables(data_tables) data_field_to_keep = _check_valid_data_field_to_keep(data_field_to_keep) _check_valid_interval_length_number(interval_length_number) _check_valid_interval_length_unit(interval_length_unit) @@ -394,7 +396,7 @@ def setup_interval_variables( *, backend_handle: duckdb.duckdb.DuckDBPyConnection, data_tables: Sequence[Literal["drug_exposure"]] | Literal["drug_exposure"], - data_field_to_keep: str | Sequence[str], + data_field_to_keep: str | Sequence[str] | Literal["one-hot"], interval_length_number: int, interval_length_unit: str, num_intervals: int, @@ -402,7 
+404,7 @@ def setup_interval_variables( aggregation_strategy: str = "last", enrich_var_with_feature_info: bool = False, enrich_var_with_unit_info: bool = False, - keep_start_date_only: bool = False, + keep_date: Literal["start", "end", "interval"] = "start", ): """Setup the interval variables @@ -434,8 +436,8 @@ def setup_interval_variables( Strategy to use when aggregating multiple data points within one interval. enrich_var_with_feature_info Whether to enrich the var table with feature information. If a concept_id is not found in the concept table, the feature information will be NaN. - enrich_var_with_unit_info - Whether to enrich the var table with unit information. Raises an Error if a) multiple units per feature are found for at least one feature. If a concept_id is not found in the concept table, the feature information will be NaN. + keep_date + Whether to keep the start or end date, or the interval span. Returns ------- @@ -445,7 +447,7 @@ def setup_interval_variables( _check_valid_edata(edata) _check_valid_backend_handle(backend_handle) - data_tables = _check_valid_data_tables(data_tables) + data_tables = _check_valid_interval_variable_data_tables(data_tables) data_field_to_keep = _check_valid_data_field_to_keep(data_field_to_keep) _check_valid_interval_length_number(interval_length_number) _check_valid_interval_length_unit(interval_length_unit) @@ -454,29 +456,22 @@ def setup_interval_variables( _check_valid_aggregation_strategy(aggregation_strategy) _check_valid_enrich_var_with_feature_info(enrich_var_with_feature_info) _check_valid_enrich_var_with_unit_info(enrich_var_with_unit_info) + _check_valid_keep_date(keep_date) time_defining_table = edata.uns.get("omop_io_observation_table", None) if time_defining_table is None: raise ValueError("The observation table must be set up first, use the `setup_obs` function.") - if data_tables[0] in ["drug_exposure"]: - # also keep unit_concept_id and unit_source_value; - if isinstance(data_field_to_keep, list): - data_field_to_keep = list(data_field_to_keep) + ["unit_concept_id", "unit_source_value"] - # TODO: use in future version when more than one data table can be used - # elif isinstance(data_field_to_keep, dict): - # data_field_to_keep = { - # k: v + ["unit_concept_id", "unit_source_value"] for k, v in data_field_to_keep.items() - # } - else: - raise ValueError - # dbms complains about our queries, which sometimes need a column to be of type e.g. datetime, when it can't infer types from data - count = backend_handle.execute(f"SELECT COUNT(*) as count FROM {data_tables}").df()["count"].item() + count = backend_handle.execute(f"SELECT COUNT(*) as count FROM {data_tables[0]}").df()["count"].item() if count == 0: logging.info(f"No data in {data_tables}.") return edata + if keep_date == "start" or keep_date == "end": + date_prefix = keep_date + else: + raise NotImplementedError("support interval extraction coming soon") ds = ( time_interval_table_query_long_format( backend_handle=backend_handle, @@ -487,16 +482,12 @@ def setup_interval_variables( interval_length_unit=interval_length_unit, num_intervals=num_intervals, aggregation_strategy=aggregation_strategy, + date_prefix=date_prefix, ) .set_index(["person_id", "data_table_concept_id", "interval_step"]) .to_xarray() ) - _check_one_unit_per_feature(ds) - # TODO ignore? go with more vanilla omop style. 
_check_one_unit_per_feature(ds, unit_key="unit_source_value") - - unit_report = _create_feature_unit_concept_id_report(backend_handle, ds) - var = ds["data_table_concept_id"].to_dataframe() if enrich_var_with_feature_info or enrich_var_with_unit_info: @@ -506,31 +497,9 @@ def setup_interval_variables( if enrich_var_with_feature_info: var = pd.merge(var, concepts, how="left", left_index=True, right_on="concept_id") - if enrich_var_with_unit_info: - if unit_report["multiple_units"].sum() > 0: - raise ValueError("Multiple units per feature found. Enrichment with feature information not possible.") - else: - var = pd.merge( - var, - unit_report, - how="left", - left_index=True, - right_on="unit_concept_id", - suffixes=("", "_unit"), - ) - var = pd.merge( - var, - concepts, - how="left", - left_on="unit_concept_id", - right_on="concept_id", - suffixes=("", "_unit"), - ) - t = ds["interval_step"].to_dataframe() edata = EHRData(r=ds[data_field_to_keep[0]].values, obs=edata.obs, var=var, uns=edata.uns, t=t) - edata.uns[f"unit_report_{data_tables[0]}"] = unit_report return edata diff --git a/tests/data/toy_omop/vanilla/drug_exposure.csv b/tests/data/toy_omop/vanilla/drug_exposure.csv new file mode 100644 index 0000000..b8d81dc --- /dev/null +++ b/tests/data/toy_omop/vanilla/drug_exposure.csv @@ -0,0 +1,10 @@ +drug_exposure_id,person_id,drug_concept_id,drug_exposure_start_date,drug_exposure_start_datetime,drug_exposure_end_date,drug_exposure_end_datetime,verbatim_end_date,drug_type_concept_id,stop_reason,refills,quantity,days_supply,sig,route_concept_id,lot_number,provider_id,visit_occurrence_id,visit_detail_id,drug_source_value,drug_source_concept_id,route_source_value,dose_unit_source_value +1,1,19073183,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,2100-01-31,32869,,0,0,31,,0,0,10,1,,308182,19073183,, +2,1,19073183,2100-02-01,2100-02-01 12:00:00,2100-02-28,2100-02-28 00:00:00,2100-02-28,32869,,0,0,28,,0,0,10,1,,308182,19073183,, +3,1,19019979,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,2100-01-31,32869,,0,0,31,,0,0,10,1,,198405,19019979,, +4,2,19073183,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,2100-01-31,32869,,0,0,31,,0,0,10,2,,308182,19073183,, +5,2,19073183,2100-02-01,2100-02-01 12:00:00,2100-02-28,2100-02-28 00:00:00,2100-02-28,32869,,0,0,28,,0,0,10,2,,308182,19073183,, +6,2,19019979,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,2100-01-31,32869,,0,0,31,,0,0,10,1,,198405,19019979,, +7,3,19073183,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,2100-01-31,32869,,0,0,31,,0,0,10,3,,308182,19073183,, +8,3,19073183,2100-02-01,2100-02-01 12:00:00,2100-02-28,2100-02-28 00:00:00,2100-02-28,32869,,0,0,28,,0,0,10,3,,308182,19073183,, +9,3,19019979,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,2100-01-31,32869,,0,0,31,,0,0,10,1,,198405,19019979,, diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py index 01c995a..41a99ba 100644 --- a/tests/test_io/test_omop.py +++ b/tests/test_io/test_omop.py @@ -14,6 +14,7 @@ "measurement": 2, "observation": 2, "specimen": 2, + "drug_exposure": 2, } # constants for setup_variables @@ -136,6 +137,56 @@ def test_setup_variables( ) +@pytest.mark.parametrize( + "observation_table", + [ + "person_cohort", + ], # "person_observation_period", "person_visit_occurrence"], +) +@pytest.mark.parametrize( + "data_tables,data_field_to_keep", + [ + (["drug_exposure"], ["days_supply"]), # ["one-hot"] + ], +) +@pytest.mark.parametrize( + "enrich_var_with_feature_info", + 
[False], # True, +) +@pytest.mark.parametrize( + "keep_date", + ["start", "end"], # "interval" +) +def test_setup_interval_variables( + omop_connection_vanilla, + observation_table, + data_tables, + data_field_to_keep, + enrich_var_with_feature_info, + keep_date, +): + num_intervals = 4 + con = omop_connection_vanilla + edata = ed.io.omop.setup_obs(backend_handle=con, observation_table=observation_table) + edata = ed.io.omop.setup_interval_variables( + edata, + backend_handle=con, + data_tables=data_tables, + data_field_to_keep=data_field_to_keep, + interval_length_number=1, + interval_length_unit="day", + num_intervals=num_intervals, + enrich_var_with_feature_info=enrich_var_with_feature_info, + keep_date=keep_date, + ) + + assert isinstance(edata, ed.EHRData) + assert edata.n_obs == VANILLA_PERSONS_WITH_OBSERVATION_TABLE_ENTRY[observation_table] + assert edata.n_vars == VANILLA_NUM_CONCEPTS[data_tables[0]] + assert edata.r.shape[2] == num_intervals + assert edata.var.shape[1] == VAR_DIM_BASE + (VAR_DIM_FEATURE_INFO if enrich_var_with_feature_info else 0) + + @pytest.mark.parametrize( "edata, backend_handle, data_tables, data_field_to_keep, interval_length_number, interval_length_unit, num_intervals, enrich_var_with_feature_info, enrich_var_with_unit_info, expected_error", [ From afad70261185cff9d6347b7eba36350d000b6181 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Wed, 20 Nov 2024 11:00:52 +0100 Subject: [PATCH 30/43] pypots minimal demo --- .../tutorial_time_series_with_pypots.ipynb | 274 ++++++++++++++++++ 1 file changed, 274 insertions(+) create mode 100644 docs/notebooks/tutorial_time_series_with_pypots.ipynb diff --git a/docs/notebooks/tutorial_time_series_with_pypots.ipynb b/docs/notebooks/tutorial_time_series_with_pypots.ipynb new file mode 100644 index 0000000..3fbe3d2 --- /dev/null +++ b/docs/notebooks/tutorial_time_series_with_pypots.ipynb @@ -0,0 +1,274 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Time Series Analysis with ehrdata and PyPOTS\n", + "ehrdata supports the extraction of data from the OMOP Common Data Model, as well as prepared datasets such as the Physionet 2012 Challenge.\n", + "\n", + "Once data is in the ehrdata format, ehrapy can operate on the data with\n", + "- exploratory data analysis\n", + "- utility functions for time series (ep.timeseries coming soon)\n", + "- ...\n", + "\n", + "From ehrdata, also fast deep-learning based time series analysis can be done using e.g. [PyPOTS](https://github.com/WenjieDu/PyPOTS).\n", + "PyPOTS is a Python toolkit/library for reality-centric machine/deep learning and data mining on partially-observed time series, including SOTA neural network models for scientific analysis tasks of imputation/classification/clustering/forecasting/anomaly detection/cleaning on incomplete industrial (irregularly-sampled) multivariate TS with NaN missing values." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example case: From any dataset in OMOP CDM 5.4 to applying DL for ..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#! 
pip install pypots" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [], + "source": [ + "import duckdb\n", + "import ehrdata as ed\n", + "import pypots\n", + "import ehrapy as ep" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load and extract data" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO - Downloading Synthea27Nj_5.4.zip from https://github.com/OHDSI/EunomiaDatasets/raw/main/datasets/Synthea27Nj/Synthea27Nj_5.4.zip to /var/folders/yy/60ln_681745_fjjwvgwm_nyc0000gn/T/tmpfndmdvwt/Synthea27Nj_5.4.zip\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "254776f379994eeab1835ffe42fe89a1", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO - Extracted archive Synthea27Nj_5.4.zip from /var/folders/yy/60ln_681745_fjjwvgwm_nyc0000gn/T/tmpfndmdvwt/Synthea27Nj_5.4.zip to ehrapy_data/Synthea27Nj_5.4/Synthea27Nj_5.4\n",
+      "INFO - missing tables: []\n",
+      "INFO - unused files: ['EPISODE.csv', '__MACOSX', 'EPISODE_EVENT.csv']\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "multiple units for features: []\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/eljas.roellin/Documents/ehrapy_workspace/ehrapy_venv_oct/lib/python3.11/site-packages/anndata/_core/aligned_df.py:68: ImplicitModificationWarning: Transforming to str index.\n",
+      "  warnings.warn(\"Transforming to str index.\", ImplicitModificationWarning)\n",
+      "/Users/eljas.roellin/Documents/ehrapy_workspace/ehrapy_venv_oct/lib/python3.11/site-packages/pandas/core/generic.py:3331: UserWarning: Converting non-nanosecond precision datetime values to nanosecond precision. This behavior can eventually be relaxed in xarray, as it is an artifact from pandas which is now beginning to support non-nanosecond precision values. This warning is caused by passing non-nanosecond np.datetime64 or np.timedelta64 values to the DataArray or Variable constructor; it can be silenced by converting the values to nanosecond precision ahead of time.\n",
+      "  return xarray.Dataset.from_dataframe(self)\n",
+      "/Users/eljas.roellin/Documents/ehrapy_workspace/ehrapy_venv_oct/lib/python3.11/site-packages/pandas/core/generic.py:3331: UserWarning: Converting non-nanosecond precision datetime values to nanosecond precision. This behavior can eventually be relaxed in xarray, as it is an artifact from pandas which is now beginning to support non-nanosecond precision values. This warning is caused by passing non-nanosecond np.datetime64 or np.timedelta64 values to the DataArray or Variable constructor; it can be silenced by converting the values to nanosecond precision ahead of time.\n",
+      "  return xarray.Dataset.from_dataframe(self)\n",
+      "/Users/eljas.roellin/Documents/ehrapy_workspace/ehrapy_venv_oct/lib/python3.11/site-packages/pandas/core/generic.py:3331: UserWarning: Converting non-nanosecond precision datetime values to nanosecond precision. This behavior can eventually be relaxed in xarray, as it is an artifact from pandas which is now beginning to support non-nanosecond precision values. This warning is caused by passing non-nanosecond np.datetime64 or np.timedelta64 values to the DataArray or Variable constructor; it can be silenced by converting the values to nanosecond precision ahead of time.\n",
+      "  return xarray.Dataset.from_dataframe(self)\n",
+      "/Users/eljas.roellin/Documents/ehrapy_workspace/ehrapy_venv_oct/lib/python3.11/site-packages/pandas/core/generic.py:3331: UserWarning: Converting non-nanosecond precision datetime values to nanosecond precision. This behavior can eventually be relaxed in xarray, as it is an artifact from pandas which is now beginning to support non-nanosecond precision values. This warning is caused by passing non-nanosecond np.datetime64 or np.timedelta64 values to the DataArray or Variable constructor; it can be silenced by converting the values to nanosecond precision ahead of time.\n",
+      "  return xarray.Dataset.from_dataframe(self)\n",
+      "/Users/eljas.roellin/Documents/ehrapy_workspace/ehrapy_venv_oct/lib/python3.11/site-packages/anndata/_core/aligned_df.py:68: ImplicitModificationWarning: Transforming to str index.\n",
+      "  warnings.warn(\"Transforming to str index.\", ImplicitModificationWarning)\n"
+     ]
+    }
+   ],
+   "source": [
+    "con_gi = duckdb.connect(database=\":memory:\", read_only=False)\n",
+    "ed.dt.synthea27nj_omop(\n",
+    "    con_gi,\n",
+    ")\n",
+    "edata = ed.io.omop.setup_obs(\n",
+    "    con_gi,\n",
+    "    observation_table=\"person_observation_period\",\n",
+    ")\n",
+    "edata = ed.io.omop.setup_variables(\n",
+    "    edata=edata,\n",
+    "    backend_handle=con_gi,\n",
+    "    data_tables=[\"measurement\"],\n",
+    "    data_field_to_keep=[\"value_as_number\"],\n",
+    "    interval_length_number=20,\n",
+    "    interval_length_unit=\"day\",\n",
+    "    num_intervals=10,\n",
+    "    concept_ids=\"all\",\n",
+    "    aggregation_strategy=\"last\",\n",
+    "    enrich_var_with_feature_info=True,\n",
+    "    enrich_var_with_unit_info=True,\n",
+    ")"
+   ]
+  },
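
setup_variables above covers the point-in-time tables (measurement, observation, specimen). For interval-style tables such as drug_exposure, the setup_interval_variables function introduced earlier in this series is the analogous entry point; its keep_date argument ("start" or "end"; "interval" is reserved for a later patch) selects which bound of each exposure gets binned into the time grid. A minimal sketch mirroring the new test, under the assumption that the loaded dataset ships a populated drug_exposure table:

    # Sketch only: interval-table extraction with the API added in this series.
    edata_drug = ed.io.omop.setup_obs(
        backend_handle=con_gi, observation_table="person_observation_period"
    )
    edata_drug = ed.io.omop.setup_interval_variables(
        edata_drug,
        backend_handle=con_gi,
        data_tables=["drug_exposure"],
        data_field_to_keep=["days_supply"],
        interval_length_number=1,
        interval_length_unit="day",
        num_intervals=10,
        keep_date="start",  # bin each exposure by its start date; "end" uses the end date
    )
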
+  {
+   "cell_type": "code",
+   "execution_count": 70,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "EHRData object with n_obs x n_var = 28 x 132, and a timeseries of 10 steps.\n",
+       "             shape of .X: (0, 0) \n",
+       "             shape of .r: (28, 132, 10) "
+      ]
+     },
+     "execution_count": 70,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "edata"
+   ]
+  },
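
The .r field is the tensor the rest of the tutorial operates on: axis 0 holds the 28 persons, axis 1 the 132 measurement concepts, and axis 2 the 10 time intervals of 20 days each (roughly the first 200 days of every observation period), with the interval grid itself exposed via .t. A quick orientation check, assuming the edata object from the cells above:

    # (n_obs, n_vars, n_steps) layout; .t describes the time axis.
    assert edata.r.shape == (edata.n_obs, edata.n_vars, 10)
    edata.t  # interval steps recorded during setup_variables
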
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Fit Model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "saits = pypots.imputation.saits.SAITS(\n",
+    "    n_steps=10,\n",
+    "    n_features=edata.shape[1],\n",
+    "    n_layers=1,\n",
+    "    d_model=10,\n",
+    "    n_heads=2,\n",
+    "    d_k=10,\n",
+    "    d_v=10,\n",
+    "    d_ffn=10,\n",
+    ")\n",
+    "\n",
+    "saits.fit({\"X\": edata.r.transpose(0, 2, 1)})"
+   ]
+  },
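
PyPOTS models consume arrays of shape (n_samples, n_steps, n_features), whereas edata.r is stored as (n_obs, n_vars, n_steps); that is the sole purpose of the transpose(0, 2, 1) above. SAITS' n_steps must match the num_intervals used during extraction and n_features the number of variables; the remaining arguments (n_layers, d_model, n_heads, d_k, d_v, d_ffn) are ordinary transformer hyperparameters, kept deliberately small for this demo. A sanity check on the same object:

    # Reorder (n_obs, n_vars, n_steps) into the (n_samples, n_steps, n_features)
    # layout PyPOTS expects; NaNs in .r mark the gaps SAITS will impute.
    X = edata.r.transpose(0, 2, 1)
    assert X.shape == (edata.n_obs, 10, edata.n_vars)  # (28, 10, 132) here
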
+  {
+   "cell_type": "code",
+   "execution_count": 63,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "predictions = saits.predict({\"X\": edata.r.transpose(0, 2, 1)}, return_latent_vars=True)\n",
+    "edata.obsm[\"saits_latent\"] = predictions[\"latent_vars\"][\"combining_weights\"][:, :, -1]"
+   ]
+  },
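
Besides the latent variables stored above, predict also returns the imputed series itself. A short sketch of retrieving it, assuming the standard PyPOTS result key "imputation":

    # "imputation" holds the completed tensor in the same
    # (n_samples, n_steps, n_features) layout as the model input.
    imputed = predictions["imputation"]
    r_imputed = imputed.transpose(0, 2, 1)  # back to (n_obs, n_vars, n_steps)
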
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Show 2D representation"
+   ]
+  },
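
One plausible way to turn the stored latent space into the 2-D scatter shown below, using ehrapy's scanpy-style API (assuming ep.pp.neighbors, ep.tl.umap, and ep.pl.umap follow the usual scanpy signatures; the cell's own source is not reproduced here):

    # Hypothetical reconstruction of the plotting step.
    ep.pp.neighbors(edata, use_rep="saits_latent")  # kNN graph on the SAITS weights
    ep.tl.umap(edata)                               # 2-D embedding
    ep.pl.umap(edata)                               # scatter plot of the embedding
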
+  {
+   "cell_type": "code",
+   "execution_count": 69,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "... storing 'concept_id_unit' as categorical\n",
+      "... storing 'concept_id_unit' as categorical\n"
+     ]
+    },
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAABPMAAAGvCAYAAAA+FJCFAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACQWElEQVR4nOzdd3hUVf7H8c/MpFcCBJJQAoQeQIrggiAgCkEEsaOugl3QVURQ1EVkEREL6g9d7IAu9oKKBRABRRSkN+kl9IQAKaTP3N8fLLOOJDAzmZbk/Xqe+6z33nPP+c4IO8fvPcVkGIYhAAAAAAAAAAHP7O8AAAAAAAAAADiHZB4AAAAAAABQSZDMAwAAAAAAACoJknkAAAAAAABAJUEyDwAAAAAAAKgkSOYBAAAAAAAAlQTJPAAAAAAAAKCSIJkHAAAAAAAAVBIk8wAAAAAAAIBKgmQeUEksXrxYJpNJixcv9ncoKMewYcPUqFGjc5bbs2ePTCaTZs6c6fWYAAAAqhKTyaQnn3zS32EAgF+RzAOAP/n222/pIAIAAAAAAhbJPAD4k2+//VYTJkxw69k333xTW7du9XBEAAAAAAD8D8k8oJo6efKkv0OocoKDgxUaGurvMAAAqHbo15wb3xEAVB0k84CzWLx4sc4//3yFhYUpJSVFr7/+up588kmZTCaHcv/5z3/UqVMnhYeHq2bNmhoyZIj27dvnUKZXr15q06aNNm/erN69eysiIkL16tXTs88+e0a7+/fv1+DBgxUZGak6derowQcfVFFRUZkxLl++XGlpaYqNjVVERIR69uypX375xaHM6Zg3b96sG2+8UXFxcerevbvT30NhYaGefPJJNW/eXGFhYUpMTNRVV12lnTt32sucPHlSDz30kBo0aKDQ0FC1aNFCzz//vAzDcKjLZDLpvvvu05w5c9SmTRuFhoYqNTVV33///RntHjhwQLfffruSkpIUGhqqxo0ba/jw4SouLraXOXHihEaOHGlvt2nTppoyZYpsNpu9zOk16p5//nm9+OKLSk5OVnh4uHr27KmNGzfayw0bNkyvvvqqPc7Th7PKWjPvxIkTGjZsmGJjY1WjRg0NHTpUJ06ccLpOAADgqLx+zfr16zVs2DA1adJEYWFhSkhI0G233aasrKwz6vBUH8MZ27dv19VXX62EhASFhYWpfv36GjJkiLKzs+1lSktLNXHiRKWkpCg0NFSNGjXSY489dkb/r7z14ho1aqRhw4bZz2fOnCmTyaQlS5ZoxIgRqlOnjurXr2+//91336lnz56Kjo5WTEyMOnfurPfff9+hTmf6mGdTUlKimjVr6tZbbz3jXk5OjsLCwjR69GhJUnFxsZ544gl16tRJsbGxioyMVI8ePbRo0aJztlPemsVl9dkl5/rtABDogvwdABCo1qxZo7S0NCUmJmrChAmyWq3617/+pfj4eIdykyZN0rhx43TdddfpjjvuUGZmpqZNm6aLLrpIa9asUY0aNexljx8/rrS0NF111VW67rrr9Omnn+qRRx5R27Zt1b9/f0lSQUGB+vTpo/T0dN1///1KSkrSe++9px9//PGMGH/88Uf1799fnTp10vjx42U2mzVjxgxdfPHF+vnnn9WlSxeH8tdee62aNWump59++owkW3msVqsuv/xyLVy4UEOGDNEDDzyg3NxcLViwQBs3blRKSooMw9CgQYO0aNEi3X777Wrfvr3mzZunMWPG6MCBA3rxxRcd6ly6dKk+//xzjRgxQtHR0fq///s/XX311UpPT1etWrUkSQcPHlSXLl104sQJ3XXXXWrZsqUOHDigTz/9VPn5+QoJCVF+fr569uypAwcO6O6771bDhg21bNkyPfroozp06JBeeuklh3bfffdd5ebm6t5771VhYaFefvllXXzxxdqwYYPq1q2ru+++WwcPHtSCBQv03nvvOfX9nI1hGLriiiu0dOlS3XPPPWrVqpW++OILDR06tMJ1AwBQ3f21X7NgwQLt2rVLt956qxISErRp0ya98cYb2rRpk3777Td7YscbfYzyFBcXq1+/fioqKtI//vEPJSQk6MCBA5o7d65OnDih2NhYSdIdd9yhWbNm6ZprrtFDDz2k5cuXa/Lkyfrjjz/0xRdfuP0djRgxQvHx8XriiSfsI/Nmzpyp2267TampqXr00UdVo0YNrVmzRt9//71uvPFGSa73McsSHBysK6+8Up9//rlef/11hYSE2O/NmTNHRUVFGjJkiKRTyb233npLN9xwg+68807l5ubq7bffVr9+/bRixQq1b9/e7e/gz1zptwNAQDMAlGngwIFGRESEceDAAfu17du3G0FBQcbpvzp79uwxLBaLMWnSJIdnN2zYYAQFBTlc79mzpyHJePfdd+3XioqKjISEBOPqq6+2X3vppZcMScbHH39sv3by5EmjadOmhiRj0aJFhmEYhs1mM5o1a2b069fPsNls9rL5+flG48aNjUsvvdR+bfz48YYk44YbbnD5e3jnnXcMScbUqVPPuHe63Tlz5hiSjKeeesrh/jXXXGOYTCZjx44d9muSjJCQEIdr69atMyQZ06ZNs1+75ZZbDLPZbPz+++/ltjtx4kQjMjLS2LZtm8P9sWPHGhaLxUhPTzcMwzB2795tSDLCw8ON/fv328stX77ckGQ8+OCD9mv33nuv4e7/NQ4dOtRITk62n5/+Xp599ln7tdLSUqNHjx6GJGPGjBlutQMAQHVWXr8mPz//jLIffPCBIcn46aef7Nc82cc4lzVr1hiSjE8++aTcMmvXrjUkGXfccYfD9dGjRxuSjB9//NF+TZIxfvz4M+pITk42hg4daj+fMWOGIcno3r27UVpaar9+4sQJIzo62rjggguMgoIChzpOf3ZX+pjnMm/ePEOS8fXXXztcv+yyy4wmTZrYz0tLS42ioiKHMsePHzfq1q1r3HbbbQ7X//od/LX/ddrpPyenudJvB4BAxzRboAxWq1U//PCDBg8erKSkJPv1pk2b2kfQSdLnn38um82m6667TkePHrUfCQkJatas2RlTA6KiovT3v//dfh4SEqIuXbpo165d9mvffvutEhMTdc0119ivRURE6K677nKoa+3atdq+fbtuvPFGZWVl2ds+efKk+vTpo59++umMaSD33HOPy9/FZ599ptq1a+sf//jHGfdOv+H+9ttvZbFYdP/99zvcf+ihh2QYhr777juH65dccolSUlLs5+3atVNMTIz9e7DZbJozZ44GDhyo888/v9x2P/nkE/Xo0UNxcXEO3/8ll1wiq9Wqn376yeG5wYMHq169evbzLl266IILLtC3337rylfitG+//VZBQUEaPny4/ZrFYinzuwQAAK75a78mPDzc/s+FhYU6evSo/va3v0mSVq9eLcl7fYzynB55N2/ePOXn55dZ5nQ/ZNSoUQ7XH3roIUnSN99841RbZbnzzjtlsVjs5wsWLFBubq7Gjh2rsLAwh7KnP7s7fczyXHzxxapdu7Y++ugj+7Xjx49rwYIFuv766+3XLBaLfeSezW
bTsWPHVFpaqvPPP9/+766iXO23A0AgY5otUIaMjAwVFBSoadOmZ9z787Xt27fLMAw1a9aszHqCg4MdzuvXr3/G2h1xcXFav369/Xzv3r1q2rTpGeVatGjhcL59+3ZJOuuUzezsbMXFxdnPGzduXG7Z8uzcuVMtWrRQUFD5/3exd+9eJSUlKTo62uF6q1at7Pf/rGHDhmfUERcXp+PHj0uSMjMzlZOTozZt2pw1tu3bt2v9+vVnTH0+LSMjw+G8rH9PzZs318cff3zWdty1d+9eJSYmKioqyuH6X/9dAgAA1/21X3Ps2DFNmDBBH3744Rl9gNPr03mrj3G2GEeNGqWpU6dq9uzZ6tGjhwYNGqS///3v9kTf3r17ZTabz+h3JiQkqEaNGmf0o1zx1+/o9HrHZ/v87vQxyxMUFKSrr75a77//voqKihQaGqrPP/9cJSUlDsk8SZo1a5ZeeOEFbdmyRSUlJeV+Bne52m8HgEBGMg+oAJvNJpPJpO+++87hredpf03ilFVGktPr1/21bUl67rnnyl1H5K/t//mNtT956nuw2Wy69NJL9fDDD5d5v3nz5i7HBgAAKoe/9muuu+46LVu2TGPGjFH79u0VFRUlm82mtLQ0lzet8GQf44UXXtCwYcP05Zdfav78+br//vs1efJk/fbbbw6bUriy6dZfWa3WMq+70/dzp495NkOGDNHrr7+u7777ToMHD9bHH3+sli1b6rzzzrOX+c9//qNhw4Zp8ODBGjNmjOrUqSOLxaLJkyc7bLhWlvK+t79+J6722wEgkJHMA8pQp04dhYWFaceOHWfc+/O105s/NG7c2GOJo+TkZG3cuFGGYTh0TrZu3epQ7vQ01ZiYGF1yySUeabssKSkpWr58uUpKSsp9Y5mcnKwffvhBubm5DqPztmzZYr/vivj4eMXExDjsNFtebHl5eU5//tNvmv9s27ZtDjugVaQj/VfJyclauHCh8vLyHDqIf/13CQAAKub48eNauHChJkyYoCeeeMJ+/a+//d7qY5xL27Zt1bZtW/3zn//UsmXLdOGFF+q1117TU089peTkZNlsNm3fvt0+q0GSjhw5ohMnTjj0o+Li4nTixAmHuouLi3Xo0CGn4jjdf9y4cWOZM1D+XMZTfcyLLrpIiYmJ+uijj9S9e3f9+OOPevzxxx3KfPrpp2rSpIk+//xzh77Y+PHjz1l/Wd+JdObMEG/02wHAX1gzDyiDxWLRJZdcojlz5ujgwYP26zt27HBY/+2qq66SxWLRhAkTzhhVZhiGsrKyXG77sssu08GDB/Xpp5/ar+Xn5+uNN95wKNepUyelpKTo+eefV15e3hn1ZGZmutx2Wa6++modPXpUr7zyyhn3Tn/myy67TFar9YwyL774okwmk8M6g84wm80aPHiwvv76a61cubLcdq+77jr9+uuvmjdv3hllTpw4odLSUodrc+bM0YEDB+znK1as0PLlyx3ii4yMtD9fUZdddplKS0s1ffp0+zWr1app06ZVuG4AAPA/p0da/bU/9tddZ73VxyhPTk7OGWXbtm0rs9msoqIiSaf6C2XFOnXqVEnSgAED7NdSUlLOWK/vjTfeKHdk3l/17dtX0dHRmjx5sgoLCx3unf7snu5jms1mXXPNNfr666/13nvvqbS09IwptmX9+1u+fLl+/fXXc9afkpKi7Oxsh2VrDh06dMYuwN7otwOAvzAyDyjHk08+qfnz5+vCCy/U8OHD7cmqNm3aaO3atZJOdR6eeuopPfroo9qzZ48GDx6s6Oho7d69W1988YXuuusujR492qV277zzTr3yyiu65ZZbtGrVKiUmJuq9995TRESEQzmz2ay33npL/fv3V2pqqm699VbVq1dPBw4c0KJFixQTE6Ovv/66wt/DLbfconfffVejRo3SihUr1KNHD508eVI//PCDRowYoSuuuEIDBw5U79699fjjj2vPnj0677zzNH/+fH355ZcaOXKkw2YXznr66ac1f/589ezZU3fddZdatWqlQ4cO6ZNPPtHSpUtVo0YNjRkzRl999ZUuv/xyDRs2TJ06ddLJkye1YcMGffrpp9qzZ49q165tr7Np06bq3r27hg8frqKiIr300kuqVauWwxSaTp06SZLuv/9+9evXTxaLRUOGDHHruxs4cKAuvPBCjR07Vnv27FHr1q31+eef29ftAQAAnhETE6OLLrpIzz77rEpKSlSvXj3Nnz9fu3fvPqOsN/oY5fnxxx9133336dprr1Xz5s1VWlqq9957TxaLRVdffbUk6bzzztPQoUP1xhtv6MSJE+rZs6dWrFihWbNmafDgwerdu7e9vjvuuEP33HOPrr76al166aVat26d5s2b51Qsp7+nF198UXfccYc6d+6sG2+8UXFxcVq3bp3y8/M1a9Ysr/Qxr7/+ek2bNk3jx49X27ZtHUYgStLll1+uzz//XFdeeaUGDBig3bt367XXXlPr1q3LTCj+2ZAhQ/TII4/oyiuv1P3336/8/HxNnz5dzZs3d9g8wxv9dgDwG19vnwtUJgsXLjQ6dOhghISEGCkpKcZbb71lPPTQQ0ZYWJhDuc8++8zo3r27ERkZaURGRhotW7Y07r33XmPr1q32Mj179jRSU1PPaGPo0KFGcnKyw7W9e/cagwYNMiIiIozatWsbDzzwgPH9998bkoxFixY5lF2zZo1x1VVXGbVq1TJCQ0ON5ORk47rrrjMWLlxoLzN+/HhDkpGZmenW95Cfn288/vjjRuPGjY3g4GAjISHBuOaaa4ydO3fay+Tm5hoPPvigkZSUZAQHBxvNmjUznnvuOcNmsznUJcm49957z2gjOTnZGDp06Bnfwy233GLEx8cboaGhRpMmTYx7773XKCoqcmj30UcfNZo2bWqEhIQYtWvXNrp162Y8//zzRnFxsWEYhrF7925DkvHcc88ZL7zwgtGgQQMjNDTU6NGjh7Fu3TqHNktLS41//OMfRnx8vGEymQxX/m+yrH+XWVlZxs0332zExMQYsbGxxs0332ysWbPGkGTMmDHD6boBAMAp5fVr9u/fb1x55ZVGjRo1jNjYWOPaa681Dh48aEgyxo8f71DWU32Mc9m1a5dx2223GSkpKUZYWJhRs2ZNo3fv3sYPP/zgUK6kpMSYMGGCva/VoEED49FHHzUKCwsdylmtVuORRx4xateubURERBj9+vUzduzYcUY/asaMGYYk4/fffy8zrq+++sro1q2bER4ebsTExBhdunQxPvjgA4cyzvQxnWWz2YwGDRoYkoynnnqqzPtPP/20kZycbISGhhodOnQw5s6dW2bfqqx/n/PnzzfatGljhISEGC1atDD+85//2P+c/JUz/XYACHQmw3Bj5X2gGhs8eLA2bdpU5vprCEx79uxR48aN9dxzz/HGFQAAAABQqbFmHnAWBQUFDufbt2/Xt99+q169evknIAAAAAAAUK2xZh5wFk2aNNGwYcPUpEkT7d27V9OnT1dISIjDGmuVVXFxsY4dO3bWMrGxsQoPD/dRRIHr2LFjKi4uLve+xWJRfHy8DyMCAAD+V
p37B1ar9ZwbYURFRSkqKspHEQFA9UIyDziLtLQ0ffDBBzp8+LBCQ0PVtWtXPf3002rWrJm/Q6uwZcuWOSyoXJYZM2Zo2LBhvgkogF111VVasmRJufeTk5O1Z88e3wUEAAD8rjr3D/bt26fGjRuftcz48eP15JNP+iYgAKhmWDMPqKaOHz+uVatWnbVMamqqEhMTfRRR4Fq1apWOHz9e7v3w8HBdeOGFPowIAAD4W3XuHxQWFmrp0qVnLdOkSRM1adLERxEBQPVCMg8AAAAAAACoJNgAAwAAAAAAAKgk/LJmns1m08GDBxUdHS2TyeSPEAAAwDkYhqHc3FwlJSXJbHZ8/1dYWHjWhd//KiQkRGFhYZ4OER5C3wwAgMrhbP0zVB9+SeYdPHhQDRo08EfTAADARfv27VP9+vXt54WFhWqcHKXDGVan60hISNDu3btJ6AUo+mYAAFQuf+2foXrxSzIvOjpa0qk/fDExMf4IAQCAs9q+aqf+OegZFeYV+TWO8/udp0dnP6CgYN//ZOfk5KhBgwb23+3TiouLdTjDqt2rkhUTfe43wjm5NjXutFfFxcUk8wIUfTMAQGWw6ecD+uWzHf4OQ10ub6z2lzT0S9vl9c9QvfglmXd6+kZMTAwdRgBAwDl+5IQmXz9NpSdtCjIF+zWWtfM3a/b4z/WPV+7wWwzlTbuMiTY7lcxD4KNvBgAIdOmbs7R67iGFh0T6OxRtmJ+heo3rKKVDHb/FwLIY1Rs9cAAA/uLl4W8o+2iuv8Ow+3r6fK1dtNHfYZzBaticPgAAANxVXFCqRe9tkQx/R/I/S97fqoI859cPBjyJZB4AAH/y4/s/65c5v/s7DAeGYeiF2/+tgrwCf4fiwCbD6QMAAMBdSz/drrzj/l365K8Kckv00wfb/B0GqimSeQAA/NfxIyf06gMz/B1GmQ7vydSbj8z2dxgAAAA+lb45S3/8csjfYZRpx6oM7Vyd4e8wUA35Zc08AAACiWHLk0o2asvidzVi4kY1a5uvmnVLFRJqk8kklRSblXvCop2bwrV9fbi2r4vQtnXhOp7p2/X0vnljgYY8coXqNIz3abvlsckmZybQOlcKAADgT3IOSQfX6Lf3SiVF+Tuacv325S6ldPTf2nmonkjmAQCqLaN4lYz896XC7yWV6IKeZZezBNkUFmFTfFKJ/nZpjiTJZpPW/RKlr2fW0q/zYmWzeX8RYpvVpq9fW6Dbn77R6205w2oYshrnnkLrTBkAAABZS6Q/vpZ+f1vau1RHipsp8/iz/o7qrE4cyde+LcfUoGVNf4eCaoRkHgCgWjEMQyr4XEb+LKl0i9v1mM1Shx556tAjT5kHg/Xtf2rps9drq6jAUqH4YmqWqnm7fDVrV6AmrQsUEWNVSKghm9Wk4iKTcrNfU2mOTZbw86SgFjKZQirUHgAAgN8V5UrLpkmrZkp5R+yXN+an+S8mFyyZvVVdBjZWfMNo1agbwU6z8DqSeQCAasMoTZeR/ahU4tkNLuKTSjT04cO69LpjmjqqgTb85tpUkLr1izXglqPqecUJJTQoOUfpXCn/XzLyJSlYRnBbmcKvk8IHyGQKdfcjuMXZzS3YAAMAAJRr54/SV/dL2fscLhfaorS98EI/BeWa7MwCLXhnsyQpJMyi+q1qqk2PeqrfKo7EHryCZB4AoMozDEPKf09G3guS4b0dYZMaFevZT3fq65m19PakxHOO0ut8cY4GDjuq83vnyuLWgL4SqWS1jJLVUu4zMsKvlCniJpmCGroVv6tsMmQlmQcAANxRlCvNe1xaPavM2+lF7WWVb19UekJxoVW71mRq15pM1agbodQeSWp1YZJCw0m/wHP40wQAqNIMo1jG8buk4mU+ac9slq64LUvtL8zTozekKOvwmZtkxCcVa+Tz+3R+rzzPNWyckPJnyMj/jxR1nxR5l0ymik35PRdG5gEAALcc2y3NulzK3l9ukYySpj4MyDtOHMnXL5/u0JoF6ep1U0s1blfb3yGhiiCZBwCocozi32UU/SyVrJOKf5dU6vMYklsUaeqcHXr42iY6su9/b5X735ilO584qMgYb+3wWiIj70WpcIEUO0Wm4GZeagcAAMBJpcXS1m+kvcukvb9KRzZK53jZl1mS4pvYfCA/u1jf/nu9ml9QVz2ua66wyDNf9gKuIJkHAKgSDFueVDBHRsEHUul2f4cjSUpoWKwpH+/SqMFNdTLHrMdeS7fvhut1pRtlZA2WYsbJFDHEK02wmy0AADirE+nSynekNf+RTmY6/ZhhSEdLG3sxMP/YtvyI9m85rsvuaae6jWP8HQ4qMZJ5AIBKzTCKZeS9KuW/Kxkn/R3OGRKTi/X0+7tUkG9S607eW6+vbCUycp6QbDkyRd3l8dpt/z2cKQcAAKqR3MPSd49If3wlGa73BPJstVVsRHohMP/Lzy7Wly+t0WUj2ql+izh/h4NKyuzvAAAAcJdRsl5G1pXSyekBmcg7rXGrQj8k8v7HyHtexsm3/dY+AACoRtZ+IL16gbR5jluJPEkqtoV7NqYAU1Jk1TevrtPBHSf8HQoqKZJ5AIBKxzBKZct9QUbW9QEzpTbQGblTZBR86dE6rf/dzdaZAwAAVHF5GdL710tz7pEKT1SoKpuq/ppypcU2ffPqeh07FLgvpBG4SOYBACoVwyiSceIf0snXJVn9HU6lYuQ8KcN60GP1WQ3nDwAAUIUd3yu93Vfa9r1HqjP7YfMyfyguKNXCmZtls9FZgmtI5gEAKg3DKJZx/G6paKG/Q6mcjJMysh/3dxQAAKAqOb5HmtFfOr7bY1VaTMUeqyvQZezN1Zr5e/0dBioZknkAgErBMEplnHhAKl7m71Aqt+JfZOR/6JGqbC4cAACgCso9LL17hZRzwKPVRluOyqwSj9YZyFbM3a2sg3n+DgOVCMk8AEClYOT9mxF5HmLkPivDVvEOo00mWZ04bDJ5IGoAABBQDEP69PZTI/M8zGIqVc2gfR6vN1DZSg0t/Zh1oOE8knkAgIBnlGz+7xp58AgjTyqY4+8oAABAZbbiTWnvUq9VHx+802t1B6L9W47r+GE2w4BzSOYBAAKaYZTIyB4rVaOpFr5gFHxQ4TpshvMHAACoQo7tln540qtN1KlmyTxJ2rDEs9OVUXWRzAMABLaTb0qlW/wdRdVTul1G8YoKVeHMFNvTBwAAqEK++odU4t1RZPVCNni1/kC09bfDKimy+jsMVAIk8wAAAcswimScnOnvMKoso4JTbUnmAQBQDe1fKe352evNxAUdrHYJveKCUu1el+nvMFAJkMwDAASuwm8k44S/o6i6Stb7O4IyWa1WjRs3To0bN1Z4eLhSUlI0ceJEGcb/5usOGzZMJpPJ4UhLS3OoZ9CgQWrYsKHCwsKUmJiom2++WQcPHnQos379evXo0UNhYWFq0KCBnn32WZ98RgAAKq3f3/JZU20ivvdZW4HiyJ4cf4eASiDI3wEAAFAeI/99f4dQtZXulGEUyGQKd+txm2GSzTj3qDtnyvzZlClTNH36dM2aNUupqalauXKlbr31VsXGxur+
+++3l0tLS9OMGTPs56GhoQ719O7dW4899pgSExN14MABjR49Wtdcc42WLVsmScrJyVHfvn11ySWX6LXXXtOGDRt02223qUaNGrrrrrtcihkAgGoh/5i08XOfNdck9DdFmI8p31bTZ236W+beXH+HgEqAZB4AICAZJZsDduRY1WGVSrZIIR3cfNq5KbSuTrNdtmyZrrjiCg0YMECS1KhRI33wwQdascJxjb/Q0FAlJCSUW8+DDz5o/+fk5GSNHTtWgwcPVklJiYKDgzV79mwVFxfrnXfeUUhIiFJTU7V27VpNnTqVZB4AAGVZ8x/JWuSz5swmm9pHfqllubf6rE1/y9yfJ8NmyGRmmRKUj2m2AIDAVLzM3xFUDyWbfNZUTk6Ow1FUVPZ/DHTr1k0LFy7Utm3bJEnr1q3T0qVL1b9/f4dyixcvVp06ddSiRQsNHz5cWVlZ5bZ97NgxzZ49W926dVNwcLAk6ddff9VFF12kkJAQe7l+/fpp69atOn78eEU/LgAAVc/uJT5v8ryIuaoTvM3n7fpLaZFVx4/k+zsMBDiSeQCAgGSUbPR3CNVDBdYktMrs9CFJDRo0UGxsrP2YPHlymfWOHTtWQ4YMUcuWLRUcHKwOHTpo5MiRuummm+xl0tLS9O6772rhwoWaMmWKlixZov79+8tqddwB7pFHHlFkZKRq1aql9PR0ffnll/Z7hw8fVt26dR3Knz4/fPiw298LAABV1sG1Pm/SbLKpT+w0WVTs87b9pfBkib9DQIBjmi0AIDD5cMRYdWYYRW7vNWs4uWae8d8y+/btU0xMjP36X9e4O+3jjz/W7Nmz9f7779unvo4cOVJJSUkaOnSoJGnIkCH28m3btlW7du2UkpKixYsXq0+fPvZ7Y8aM0e233669e/dqwoQJuuWWWzR37lyZTExdAQDAJdn7pfyjfmm6ZtB+JQRv0YGSdn5p39esJTZ/h4AARzIPABBwDFuuZE33dxjwsJiYGIdkXnnGjBljH50nnUrW7d27V5MnT7Yn8/6qSZMmql27tnbs2OGQzKtdu7Zq166t5s2bq1WrVmrQoIF+++03de3aVQkJCTpy5IhDPafPz7YWHwAA1ZIfRuVJp14KLsm5q9ok8gBnMM0WABB4rPskGf6OolowmcLcfvb0BhjOHK7Iz8+X2ezYRbFYLLLZyn9LvX//fmVlZSkxMbHcMqefP71WX9euXfXTTz+ppOR/U1kWLFigFi1aKC4uzqWYAQCo8o7v9kuzS3Lu1qaCNL+07S9BwaRqcHb8CQEABB6jwN8RVBsZB9yfxmE1zE4frhg4cKAmTZqkb775Rnv27NEXX3yhqVOn6sorr5Qk5eXlacyYMfrtt9+0Z88eLVy4UFdccYWaNm2qfv36SZKWL1+uV155RWvXrtXevXv1448/6oYbblBKSoq6du0qSbrxxhsVEhKi22+/XZs2bdJHH32kl19+WaNGjXL7OwEAoMoq8X3/bHnuDdpU0M/n7fpbSUihv0NAgCOZBwAIQNZzF4FHPHf7Iq1b7N76hDaZZJPZicO1kXnTpk3TNddcoxEjRqhVq1YaPXq07r77bk2cOFHSqVF669ev16BBg9S8eXPdfvvt6tSpk37++Wf7OnwRERH6/PPP1adPH7Vo0UK333672rVrpyVLltjLxMbGav78+dq9e7c6deqkhx56SE888YTuuusut74PAACqNFupT5s7WNxKq05e7dM2A0GRpUD3r7pHR04eOXdhVFsmwzB8Po8pJydHsbGxys7OdmrtHABA9WIUr5Fx7Hp/h1HllZZIg5u3lckUqqfmPqoOF7d1uF/e7/Xp69+sb6LIaMs52zmZa9WAdrv43Q9g9M0AAOf003PSj0/5pKlSI0QfHn1R2dYkn7QXSA7GbNdXqa+oflR9zeo/S3Ui6jjc5zcbEiPzAACByBTh7wiqhb3bwlRSZFZxYYnGD35WW3/f4dLz3lozDwAABKCQKJ819VvuTdUykSdJmZH7JUn78/br7gV3K7so288RIRCRzAMABJ6gRmLDde/bsirS/s8FeYWaeN1UFeQ5vx6Ot9bMAwAAASi+hU+aySppoPX5A3zSViA6Er3H/s87TuzQhF8n+C8YBCx61wCAgGMyhUpBTf0dRpU3/2PHHVuP7M3UG2Pe81M0AAAgoCW290kzG/P7y9C5l/GoigqC8rQ3bqPDtQV7F2jennl+igiBimQeACAwBbfxdwRV2vb14dqyOvKM69+88YNWL9zgVB2nNsBw7gAAAJVcRE2pRkOvNlFsC9PWwp5ebSOQbY1fLqv5zI1Gnl7+tI4VHvNDRAhUJPMAAAHJFJzq7xCqtK9n1SrzumEYmnrHdBUXlZyzDpvMsjpx2OhuAABQNXh5dN7Wwl4qMarn2smGbNqU8EuZ944VHtOUFVN8HBECGb1rAEBgCu0tfqa8I/uYRYvnxJV7/8jeTP30ya8+jAgAAFQKLS7zavVbCnp7tf5All7jD+WGZZV7f96eeTp88rAPI0Ig47+SAAAByWRJUsZhRud5w/Rx9VRUcPYuwNfTz702CxtgAABQzaReqWJzrFeqthpBOlrS2Ct1B7oSc5F+afzZWctYDas+3vqxjyJCoKN3DQAISEf2Zuq1f557qidc88t3MVr0Rfmj8k7b/Os27Vy/56xlbP+dQuvMAQAAKr8d63O0KbeXV+rOKk2WTcFeqTvQLW84VzlnGZV32ufbP1eJjf4xSOYBAALU1Dun65dvQ3Rwd4i/Q6kyTuaYNW1sfafLz5+xyIvRAACAyqQgr1g/fbhVG/PTZBie39wqoyTF43VWBgejd2hjwk9Olc0qzNKSfUu8HBEqA5J5AICA880bC7T6hw2STHr3+QR/h1NlfPpavI5nOv/Ge+MvW85632qYnD4AAEDl9tOH21SQW6Ica4L+KOjj8fozS5p4vM5Alx+cq0VNZ0sudJXWZKzxXkCoNEjmAQACSmlJqd598n/rgSz6Ik7Lvo/xY0RVx9Y1ru0Od3D72RdZdmYn29MHAACovI7uz9WOlRn2819yhynXWsujbRTYvLMWX6AqsuTrm1bTlRt2zKXnth7f6qWIUJnQuwYABJSlny/XscMnHK793yP1lXPM4p+AqpCiQtd+9m024+z3DbPTBwAAqLw2LDngcF5sRGpx9giPtmE1qs/SKgVBeZrberqyIg+cu/Bf7Di+wwsRobKhdw0ACChflbGL6vHMYP17XD0/RFO15OeREAUAAK4pLijVthVHzrieXtxRm/Iv8UNElVt2WKa+Sv0/ZUalu/V8obXQwxGhMgrydwAAAJy2Z9M+bfjpjzLvLfoiTvVTivT3UWd2JnFuJcUm7dse6tE6nZ1Ca9XZR/gBAIDAteW3QyotspZ576ecuxRjyVCD0PUVbsdiKq5wHYHMkE0bEn7SioZzVWphR1pUDCPzAAABY9mXv5/1/nvPJ+jT1+J9FE3Vsm9niEqKPfuzb5Nzm2DYPNoqAADwpd3rjpZ7z6ZgfXviUR0sbl3hdsLNORWuI1Blh2Xqy9RpWtb4C8dEHu874SaSeQCAgLF
t1c5zlnnzX0l697m6PoimaklqVKyu/bL9HQYAAKhkMtNzz3q/1AjTV8ee0N6iDhVqJz54V4WeD0RHovbox5T/6OPzntHhmDM/X+d9/RWXn+CHyFDZkcwDAASM7auc68TNfjFB44c1UtZhVotwVli4oSdn7NEjr+xVdI1Sj9Rpk9npAwAAVD7Zmfkqyj93v8GqUH1z/HH9lnuTrIZ7/bNCa7RbzwWa0MJjMhcs0xepz+mLti9qW53fZTWX/R2mZHXQ1etHq8OBS2QyTD6OFJUZ/xUEAAgI2UdzlJFe/jSOv/ptfqw2Lo/U8IkHdck1x70YWdVy8VUn1LrzST16fYoO7qnYGnpWwyyrEzvVOlMGAAAEnoy9Zx+V92eGLFp18hrtLuqsPrHTVCf43DMuTsu3xmrtyYHuhOg/hk3xmWtlsRUrIj9D0Xn7FJ2brpCSPElS5DGTXrjKLJu57CRdrZNJqlF4arbJBekDlZjdVPNbvM16enAKvWsAQEDYs2mfy8/kZQfpufsb6olbGmvL6ggvRFU1JTQo0fNf7FByc3ZDAwAA5Tt26KTrz5Qm69OsKVqaM0y5VufWOl6Sc4+KFONyW/4UnbdPbTe/rdZb3lOj9HmqdWyzPZEnSZ23G7p3rk0yyl4Yr/WRCx3OG2a30oA/RijY6tkNy1A1kcwDAASEwjz3E0vLf4jRA5c30+irm+jEUYsHo6q6atUt1eQPdyqhYZHbddhkcvoAAACVT0k5u9ieiyGL1uVfofcyp+v97Ee01NpWtnKmke4s/Jt2Ff2tImH6RcLh5ecs02OTob6rz0zmhZSGqXlm5zOuJ+Y2UdqWO2SxlT+JMj6czeDANFsAgI/lnTipbat2afuqXdq5brdyj+WppKhU2UcrtoNZWIRVtz92WDVqu9fprI5qJZTqqdm7NeLS5iouPPP9XnhMmHSWfy1MswUAoGrIPVaozPRcZabn6tjBkyopKpW11FB2ZkGF6s00mzQt+DwVlJynespUJ/M2tTXvVjvzLrU27VW0qUCrT17poU/hOxZrkRKdSOZJ0t8X2bQmxaTMGv9LZjY72lnBtrJH4NXLaa6Ldl2vRU1nl3m/Zc2W+kk/uR40qhSSeQAArzt68Ji+e3OhFr7/sw5sP+SFFgw9/vpeteqU74W6q7YGKUW6dewhvf5kvTPuNWnXSFrq+5gAAID3Ze7L1cbF+7V7/VEV5Hp+nbZ8k6FPoopU8N93egcUrwO2eH1lOz291FCT0iJdXRLn8ba9re6RFQqyOjerJKxEGv6tTf+6wSyZTAotjVDH/Zee9ZkWmV20q+Za7a256cx7NVu4FTOqFpJ5AACvWbd4k7589Tst+3KlrKXeGzF3+S1Z6tLH+QWa4eiK249q6bex2rQiyuF6t4Gd9epZknlWmWV1YsUOZ8oAAADvs1pt2vH7EW1YckBHdldsVsS5LAgvVt5ZuwAmNSuuhDvYGjbVP+DayLg2ew31WWdoYXuTLtx9lSJLYs/5zEW7rtdHMZNVHOQ4OrJ3/d4utY2qiWQeAMDjjmdka9q9b+rnz5ybflARdesX6/Z/emO0X/VhsUgPTd2n4Ze2UNF/X5+HR4Wp940XSo+U/5zNMJW7/s1fywEAAP/KTM/VwlmblXXA9U0tXLUluFTbQmxnLRNiSC2LK99ax/UP/KSokwddfm7gcpt2Nmir5kfPXCuvLJElseq++2r92Ow/9mvn1z1fKXEpLreNqodX5QAAj1r80S+6s82DPknkSYZGTd2niKizdxZxbvWaFKvv9cfs5xff2EORMWffIdj235F55zpsdDcAAPAba6lNy7/apU+fWemTRF6+DP0Qfu5pu4mlZoVUsk2ywgsylbJrjlvPJh2TrlzZ0aVnmh/trBr5de3n17e83q22UfXQuwYAeERJcYmeufn/NOmGl5R91DdTXrul5ah99zyftFUdXH5Llv2fB43o58dIAACAJ+QdL9Qnz6zUym/3yGY7c1dVbwiSFG89d6qhrhNlAophqOWW/8hic399wabpq11+JvVId0mndrHt07CP222jaqlkf3sAAIGoML9I4wZN0cLZP/u03UG3HvVpe1Vdo5aFavu3PPW5qYeatEs+Z3mbYXb6AAAAvnXiSL4+f261svb79sVniEy6+mSImpac/fc/obIl82QoJndPhWqodXSDQopOuPRM88zOCrKG6B8d/qFgc3CF2kfVUdn+9gAAAkxJcYnGD56iVfPX+bTd+imFOu9CRuV52tX35GjEy7c6VdYqk9MHAADwnZysAn350hrlHnNux1VPC5JJg06GqPFZEnp1rZWsf2AyKy+yfoWqMMumGtk7XXom1Bqu/qVDdGWzKyvUNqoWknkAgAp5dugrWv3DBp+3e/ktWTLzK+ZxXfrkKDru7GvlAQCAwFWUX6KvXl6rvONFfo3DIpOuOBmihNKyk3axtkqWzJNUGF6rwnVE5+5z+ZmO+T0r3C6qFv4zCADgtgXvLtHij5b5pe1eg0/4pd2qzmIpkkqde2PMNFsAAALP0o+3KzujwN9hSJKCZdJl+SGy/GW5PoshmSrhyH2bB6a5Ruemu/xMzgH/JmYReOhdAwDcknXouKY/OMMvbdepV6y4+FK/tF0tlG50qphVzk61BQAAvrBnw1Ft+e2wv8NwUMtmVvfCIIdrlTURYTNVPPLoPNeTeQW5JX6bMo3AVFn/DgEA/Oylu19X7vGTfmm7Wbt8v7RbXRglm/wdAgAAcFFRfokW/2eLv8Mo0/lFQUoq/V/6obK+kjXbKh55cGmBgkpc78tmpudWuG1UHSTzAAAuW/7NKv02d5Xf2m/WLjCmjlRZpdudKsY0WwAAAsfv3+7Ryexif4dRJrNMuqTgf1NUDZNUIuMsTwSmIKtnpruabSUuP3PsIBu/4X/oXQMAXDbnle/82n6TVJJ5XmU4N+LSapidPgAAgPeUFFu1Zdkhf4dxVnWtZtX70+i8o39dSK8SiDx50CP1mAyby8+UFLFwCf6H3jUAwCUHdx7Wqvnr/RpDdA06M15lBOZbfQAAULbtK46oKD/wJ6+2L7JIkmJsJkW5ns/yq6DSfIUXZHqkLps56NyF/sJaUvmSn/Ae1/8EAQCqta+nz5dh+LczERxCZ8a7nNupzZBJNid2ojMq4W51AABUJhuW7Pd3CE5pXmLRwUKbehQGK7SS9Q+icvd5LGKrJdTlZ8yWyvV9wbtI5gEAXPLDf37ydwjycy6x6jM518F0dgot02wBAPCerIN5OrqvcqynFiSTLikM8XcYbonJ3eeRevLDastmcf07sITQn8L/kMwDADjt8J4MncjI9ncYKi6kM+NVlgZOFbMZJtmMc78ldqYMAABwz5HdOf4OoVqIP7rWI/XkRjd067nY2uEeaR9VA/81BABw2raVO/0dgiTp2BHeRXmTKbiNv0MAAABOytyb6+8Qqryo3HTF5uz2SF3uJvPiG0Z7pH1UDSTzAABO275ql79DkCRt3xDh7xCqtuBUp4pZZXb6AAAA3pGRTjLP2+od/NljdeVGOzcD4s+CQsyKS4z0WAyo/BjaAA
Bw2o61nnkjWVHb1zPNwHvMUlArp0oyzRYAAP8ybIayDlSO9fIqq6DSfCUcWemRuoqDo3QitqnLz9WuHyWzmf4U/odX5QAAp2UfDYw3vyTzvCj4PJnMjHwEAKAyKCmyylpi83cYVVqjPd/LYiv2SF0HE7vJMLs+pqpeiziPtI+qg2QeAMBpJYUl/g5BkpSXHaS9W53bcRWuMUVc73RZm8xOHwAAwPNKSeR5VUz2LjXY/6NH6jJk0sGk7i4/ZzJJrbsneSQGVB30rgEAldK8D2v6O4Sqx1RDChvgdHGrYXL6AAAAqEzM1mK12vKeTDI8Ul9WrVQVhtVy+bnkNrUUU4tZKXBEMg8A4LSQ8BB/h2A3/6OaKiwgSeRR4VfLZGLEIwAAlUVQCP9J7y1Ndn+lyIIMj9RlM1m0q/FAt55t07O+R2JA1cLffACA02LjY/wdgl3uiSD99FUNf4dRdZjCZYr8u0uPnN4Aw5kDAAB4XnCIhYSeF9Q7sEQN9y/yWH17kvspL8r1pFyt+lFqmMpsFJyJv/UAAKc169DY3yE4mPN2bRksFeMRpqiHZLLUc+kZwzDL5sRhGHQ3AADwBpPZpNr1o/wdRpWSeGiZmm//2GP15UbV196GaS4/Z7aY1GdoK5lMvBTFmehdAwCc1vz8FH+H4GDnxgjlHLf4O4zKL7izFHGzv6MAAABuiE8OnJkTlV2DfQvVcutseSp9ZjUF6Y+WN8swu95f7ZiWrPgG0R6KBFWN63siAwCqrWadmvg7BAeh4TZFxVr9HUblZgqXKXayW299rTLJ6kR315kyAADAPXUakvCpqODiXLXY/pHqZK7xWJ02k1mbUm9ze3rt+Zc18lgsqHoYmQcAcFqdBrVVKynO32HYpaQWyMJrqQqwyBT7rExBDd162mY4u26eh8MGAAB2dRszMq+iWm55z+OJvD9a3qKjtc9z+dnwmBCl3dlGFgvpGpSPPx0AAJf0HdrL3yHYNW2X7+8QKjGTTDFPyRTWz+0anFkv7/QBAAC8Iy4hkoReBRWGx3usLqs5WBtT79SRup1dfjY0IkiD7j9PNepGeCweVE30rgEALrn87ktlDpA3hXXqlfg7hEoqSKbY52WKuNrfgQAAAA9o29O1TazgqDDMMzNPcqKTtbLTwzpau53Lz0bEhujKhzqqdn2mTePcAuO/xgAAlUadhvG6YEBHf4chSQoJZStbl5kTZYp7S6bwgRWuyiaT0wcAAPCepp3qKiwq2N9hVFo2c8W+O6spSDsbD9LKjqN1MjLJ5ecTm8bq6jGdVKseOxPDOaw0BABw2eD7+uvXr1b6OwyZeSXlmvBrZYp+VCazZzqKVsMkq+HEBhhOlAEAAO6zBJvVunuSVn+/19+hVFLudSptpiBlxp+n3cn9lR+Z6PLzQSFm/e2KFLXrXV8mM/0lOI9kHgDAZR0vaaee13XVko9/9WscJcV0epxibiBT7JMyhfbwdyQAAMBLOqUla/vvR5SbVejvUCodk+Ha0i2FoXE6kNRdBxO7qSTEvfUK67eMU88bW6hGHdbHg+tI5gEA3PKPV+7QusWbdSIj228x5OdZ/NZ2pWGKkin+B5lMnk98Oru5BRtgAADgfSFhQbr4llb68qU1EjvJuyQ7urEO1+2s6Nx9Ci/IlNmw2u9ZzcE6GZmo3KiGyo1uqJzoBsqLqi+Z3O/f1G9VU1c80N4DkaO6IpkHAHBLbO0Y3f/qHfrXtS/4LYbdm8P81nalEdzeK4k86b9r5jkxhZY18wAA8I36LeLUpkc9bfzpgL9DqVRyYxtrc2zj/10wbDLbrLKZgyQv9KMatPTMhhuovnhVDgBwW4+r/6a02y72W/vb1zMt4ZyCU/0dAQAA8KGuV6WoVr1If4dRuZnMslmCvZLIk6T4huxYi4ohmQcAqJCRr9+lC6/s4pe2j+wPUfYxptqejSm4jdfqNpzcydZgZB4AAD4TEhakgfe3V2x8uL9DQTlI5qGiSOYBACrEYrHo8Q9Gqsc1f/NL+9vXMTqvXKYIKeRCr1VvM0xOHwAAwHciY0M1eFQHxSXQTwo0Sc1qKCwy2N9hoJIjmQcAqLDgkGA9/sFIDRrRz+dt/zQ31udtVhphA2UyR/k7CgAA4AdRcWG68qGOSmxKXymQtOlZz98hoAogmQcA8AiLxaJ/vHKHnv7uccU3qOWzdhfPiVPuCabalsUUcaNX6z+9m60zBwAA8L3w6BBdOaqjLrymqYKC+T32t4iYEDXpEO/vMFAF8LcZAOBRnfu115sbpqr/7X180l5RgVk/fMKOYGcI7ihTcCuvNsE0WwAAAp/JbFL7Sxrq+n92UUITRun5U+vuSbJYSMOg4vhTBADwuMiYCI168x5NX/2sulzW0evtfT2rtmw2rzdTiZhkinrQ6604s/nF6cMVVqtV48aNU+PGjRUeHq6UlBRNnDhRhmHYywwbNkwmk8nhSEtLs9/fs2ePbr/9doc6xo8fr+LiYocyf63DZDLpt99+q/iXAwBAgKlRN0JXje6oASPaKYbNMXwuPDpY7S6u7+8wUEUE+TsAAEDV1bR9Y02a+6ge6j1e65ds9lo7B3aF6rvZNTXg5mNea6NSibhRptAL/B2F26ZMmaLp06dr1qxZSk1N1cqVK3XrrbcqNjZW999/v71cWlqaZsyYYT8PDQ21//OWLVtks9n0+uuvq2nTptq4caPuvPNOnTx5Us8//7xDez/88INSU1Pt57Vq+W6aOAAAvmQym9SoXW3VbRKj2eN/U9HJUn+HVG30vLGFwqNC/B0GqgiSeQAAr3t45n26q91Dys8t8Fobb/4rSZ165SqhQYnX2qgULPVlihrjk6acnULr6jTbZcuW6YorrtCAAQMkSY0aNdIHH3ygFStWOJQLDQ1VQkJCmXWkpaU5jNRr0qSJtm7dqunTp5+RzKtVq1a59QAAUBWFR4Wo140tNe/Njf4OpVpoen4dpXSo4+8wUIUwzRYA4HV1k+N1x5S/e7WNgpMWvTS6gVfbCHxmmWImy2SO8Elrrq6Zl5OT43AUFRWVWW+3bt20cOFCbdu2TZK0bt06LV26VP3793cot3jxYtWpU0ctWrTQ8OHDlZWVddZ4s7OzVbNmzTOuDxo0SHXq1FH37t311VdfufNVAABQ6TTtVEcpHUkweVtETIguGtLc32GgiiGZBwDwicvvvlQdL2nr1TbW/Bytr2ZU3ymSppiJAT29tkGDBoqNjbUfkydPLrPc2LFjNWTIELVs2VLBwcHq0KGDRo4cqZtuusleJi0tTe+++64WLlyoKVOmaMmSJerfv7+sVmuZde7YsUPTpk3T3Xffbb8WFRWlF154QZ988om++eYbde/eXYMHDyahBwCoNnre2Fzh0cH+DqPKCgkP0uX/OI/ptfA4ptkCAHzCZDLpnx+N0p1tRynr4HGvtTP9iXqKTypR1345XmsjEJmiH5cp4lqftunqNNt9+/YpJibGfv3Pa9z92ccff6zZs2fr/fffV2pqqtauXauRI0cqKSlJQ4cOlSQNGTLEXr5t27Zq166dUlJStHjxYvXp47iT8
oEDB5SWlqZrr71Wd955p/167dq1NWrUKPt5586ddfDgQT333HMaNGiQE98AAACVW3hUiC6/7zx9OmWlDDYT86iQMIsuv+88xTeI9ncoqIIYmQcA8JnouCil3XaxV9uwWU2adHeyViysLh0ns0wxk2SKHOrzll2dZhsTE+NwlJfMGzNmjH10Xtu2bXXzzTfrwQcfLHckn3RqTbzatWtrx44dDtcPHjyo3r17q1u3bnrjjTfO+ZkuuOCCM+oAAKAqq5McozoNq0u/yTfCooI1eFRHJabE+jsUVFEk8wAAPhUaXnYCx5NKis168tbGWjSnhtfb+qujh8OUfcz7n1GSZGksU83ZPh+R5235+fkymx27KBaLRTZb+UMG9u/fr6ysLCUmJtqvHThwQL169VKnTp00Y8aMM+osy9q1ax3qAACgOggOq9qT9ixBrm3GVRENU2vp+sc7K54EKbyoav+NBQAEnKAQ3/z0WEtNemZEstb8HKW7xx9UZIy3546YpIhbFH/eKMkokZEzSSr8wkttmaWIoTJFPyiTKcxLbZybIcmmc3eODRfrHThwoCZNmqSGDRsqNTVVa9as0dSpU3XbbbdJkvLy8jRhwgRdffXVSkhI0M6dO/Xwww+radOm6tevn6T/JfKSk5P1/PPPKzMz017/6Z1rZ82apZCQEHXo0EGS9Pnnn+udd97RW2+95WLEAABUbpagqjnOJyTMoguvaabW3ZN07NBJ/fjuHzqy2ztLsYSEB6n7tU3VqluSV+oH/oxkHgDAp8KjfJt8mvdBLa1eEq2Rz+3X+b1zvdOIJVmm2KdlCul86twULlONKTKKBsjImy6VrPJQQyYppLtMUffJFNLBQ3W6z9U185w1bdo0jRs3TiNGjFBGRoaSkpJ0991364knnpB0apTe+vXrNWvWLJ04cUJJSUnq27evJk6caJ+6u2DBAu3YsUM7duxQ/fr1Heo3jP+lFydOnKi9e/cqKChILVu21EcffaRrrrnGpXgBAKjsgkMt/g7B4xq0rqnef2+p6Jqn+p41EyN11ZhO2rhkv9b9uF85mQUeaScoxKzmneuq8+VNFBXno9kZqPZMxp97tD6Sk5Oj2NhYZWdnOyyEDQCo+jb8/IdG9XzCL233vOK4rrzjqFp1yvdMheZ6MkVcL0UOlckUXm4xo2SrjPz3pcKvJOOk6+2YakjhV8sUcYNMQQ3dj9dF5f1en75+8Tf3KCjy3J3W0pNF+nHAa/zuBzD6ZgBQva2Yu1u/z93t7zA8om7jGLW7uL6ad04ot4xhGErffEwblxzQ3g1H5U5WpEbdCLW5qJ5adk1QaITvdgTmNxsSI/MAAD7WtEMjmc0m2Ww+f5ekJV/GacmXcUppk6+BQ7PU+8oTCotwdfrtf0fHRdwkhfaSyXTuaSmm4BYyxU6QET1GKlkrlWyUUbJJKt0kWfefWb+lkRScKlNwqhTURgppL5Mp8N70emtkHgAA8K3KvgFGUIhZzTrXVdue9Z1aq85kMik5tZaSU2vp5IkiHd6VrYz0XGWm5ypzb64KT5Y4lDcHmVQrKUrxydGq0zBadZJjVLtBlEwm+jjwD5J5AACfCo8KV73mSdq35YDfYti5MUIvjYnQv8fVU0pqgZq1K1Czdvlq2rZAtRJKFBJqyGQ2VFJoVm62RSFR7RTfqIcU3PZUks0c51a7JnOUFNpdCu1uX2nOMEoko0hSsaQQyRQmk6ly/DyTzAMAoGqIT658ybxm59dRQkrsqcRa/SgFhbg3VTiyRqhSOtZRSsc69mvWEptKS6wybJIlxKygILNMZvozCByV478WAABVSvPzm/g1mXdacaFZf6yK1B+rIs9a7t2dL8oUVdcrMZhMwZLJd1MzPIlkHgAAVUNkbKgiY0N0MrvY36E4JbpWmPre0cZr9VuCzbIEV81NQVA18KcTAOBzXQd29ncITmvUpoESG3snkQcAABAoGp0X7+8QnNaoXW1/hwD4Fck8AIDPXTi4s2omujdV1dcGDe/n7xAClmGYnD4AAEBga3NRPX+H4LS2PStPrIA3kMwDAPhcUHCQ+t9+sb/DOKeI6HBdcvNF/g4jYNlkcvoAAACBrXb9KCWmxPo7jHOq1yJOcQlnXyIFqOpI5gEA/OLyuy+VJci9hYp9pc/fL1J4VLi/wwAAAPCJNpVgxBuj8gCSeQAAP6ldr5Yuv/tSf4dRrojocA155Ap/hxHQTm+A4cwBAAACX0qnOqpVP8rfYZSrTnK0GrevPGv7Ad5CMg8A4De3P3OTEhrX8XcYZbrz2ZtVpyGdxbNhzTwAAKoWi8WsPkNbyWwOvN9uc5BJFwdobICvkcwDAPhNeGSYRr89QiZTYHXKOl7SNqBHDQIAAHhLfINodeyf7O8wztB5QGPVSgrcUYOAL5HMAwD41Xm9UjVoRODsGBsRE66H3hru7zAqBabZAgBQNZ1/WSPVbhA4ibM6ydHq2C/wEoyAv5DMAwD43T1Th+qCAR39HYZCwoL1rzmPML3WSUyzBQCgarJYzBowop2ia4b5OxTFxIfrshHtmF4L/AnJPACA3wUFB2ncx6PU8ZK2foshODRYT3zykM7rleq3GAAAAAJFVFyYBo1sr6i4UL/FEF0zTFc80F6Rsf6LAQhEJPMAAAEhNDxUE79+VN2u6OzztsOjwjTpm0d1wYBOPm+7MjOcnGLLyDwAACqnGnUidOXojoqtE+7ztuMSInTVmI6Kqe37toFARzIPABAwQkKDNf6z0brr2ZsVEhbskzZbd22uV39/Rh0u9t+owMrKkGQYThz+DhQAALgtpla4rh17vlr+LcFnbbbqlqirHzlfUXH+n+YLBKIgfwcAAMCfmc1mXTt6kC64vJOev+1V/fHbdq+0ExIWrGETb9DVDw6Q2cy7LXfYZJJJ5x51Z3OiDAAACFyhEcHqM6y1UjrW0eLZW3Qyu9gr7UTFharX31sqObWWV+oHqgqSeQCAgNSwZT29+PNEffXqPH3+8jc6vDvDI/VagizqNrizbp04RA1a1PNInQAAANVBo3a1dUPTC/T73D3649dDKi4o9Ui9oRFBatktUZ0HNFZoOGkK4Fz4WwIACFgWi0VX3n+ZrrgvTb9/t0ZfTZ+nld+vlc3m+sTN2vVq6rI7LlH/O/uodlJNL0Rb/Ti7Uy1r5gEAUHWERgSr+3XNdMEVTbRtxWFtWHJAWfvz3KorvmG02lxUT8261FVwiMXDkQJVF8k8AEDAM5vNumBAJ10woJMy9h3Vpl+2avuqndq+epe2r96tk9n5DuVNJpPqN09Us05N1LxTipp1aqLUbi1kCaKT6Ek2wySTE4k6G8k8AACqnOBQi1J71FNqj3o6uj9PR3ZnKyM9V5l7c5V1ME+2UseXr5Ygs2rVi1R8cozqNIxW3cYxqlUvyk/RA5UbyTwAQKVSp0Ft1RlSW72HXChJMgxDJ7PzVVxYLJvVpuDQYIVHhSkkLMTPkQIAAFQPtetHqXb9KKX+99xmtam40CpriU2SZAk2KyTMIrOFdYoBTyCZBwCo1Ewmk6JqREqK9Hco1c7p3WqdKQcA
AKoPs8WssEgSd4C3kMwDAABuYc08AAAAwPdIlQMAAAAAAACVBCPzAACAWxiZBwAAAPgeyTwAAOAWdrMFAAAAfI9ptgAAAAAAAEAlQTIPAAC45fRuts4cAAAAgDeYTCbNmTPH32H4FNNsAQCAW04l6pxZM88HwQAAAKBaOnTokOLi4vwdhk+RzAMAAG5hAwwAAICqx2oztGL3MWXkFqpOdJi6NK4pizlw+3MJCQn+DsHnmGYLAAAAAAAAfb/xkLpP+VE3vPmbHvhwrW548zd1n/Kjvt94yLvtfv+9unfvrho1aqhWrVq6/PLLtXPnTklScXGx7rvvPiUmJiosLEzJycmaPHmy/dm/TrN95JFH1Lx5c0VERKhJkyYaN26cSkpK7PeffPJJtW/fXu+9954aNWqk2NhYDRkyRLm5uV79jJ5EMg8AALjFcOEAAABAYPt+4yEN/89qHcoudLh+OLtQw/+z2qsJvZMnT2rUqFFauXKlFi5cKLPZrCuvvFI2m03/93//p6+++koff/yxtm7dqtmzZ6tRo0bl1hUdHa2ZM2dq8+bNevnll/Xmm2/qxRdfdCizc+dOzZkzR3PnztXcuXO1ZMkSPfPMM177fJ7GNFsAAOAWptkCAABUDVaboQlfby7zJawhySRpwtebdWnrBK9Mub366qsdzt955x3Fx8dr8+bNSk9PV7NmzdS9e3eZTCYlJyefta5//vOf9n9u1KiRRo8erQ8//FAPP/yw/brNZtPMmTMVHR0tSbr55pu1cOFCTZo0yYOfynsYmQcAAAAAAFCNrdh97IwReX9mSDqUXagVu495pf3t27frhhtuUJMmTRQTE2MfeZeenq5hw4Zp7dq1atGihe6//37Nnz//rHV99NFHuvDCC5WQkKCoqCj985//VHp6ukOZRo0a2RN5kpSYmKiMjAyPfy5vIZkHAADcwzxbAACAKiEjt/xEnjvlXDVw4EAdO3ZMb775ppYvX67ly5dLOrVeXseOHbV7925NnDhRBQUFuu6663TNNdeUWc+vv/6qm266SZdddpnmzp2rNWvW6PHHH1dxcbFDueDgYIdzk8kkm83mlc/mDUyzBQAA7nFymq2YZgsAABDQ6kSHebScK7KysrR161a9+eab6tGjhyRp6dKlDmViYmJ0/fXX6/rrr9c111yjtLQ0HTt2TDVr1nQot2zZMiUnJ+vxxx+3X9u7d6/HY/Y3knkAAAAAAADVWJfGNZUYG6bD2YVlTqowSUqIDVOXxjXLuFsxcXFxqlWrlt544w0lJiYqPT1dY8eOtd+fOnWqEhMT1aFDB5nNZn3yySdKSEhQjRo1zqirWbNmSk9P14cffqjOnTvrm2++0RdffOHxmP2NabYAAMAthuH8AQAAgMBlMZs0fmBrSacSd392+nz8wNZe2fzCbDbrww8/1KpVq9SmTRs9+OCDeu655+z3o6Oj9eyzz+r8889X586dtWfPHn377bcym89MaQ0aNEgPPvig7rvvPrVv317Lli3TuHHjPB6zv5kMw/dd7JycHMXGxio7O1sxMTG+bh4AADihvN/r09cbvfNPmSPOPdXCll+oPbc9xe9+AKNvBgBA5eDt3+zvNx7ShK83O2yGkRgbpvEDWyutTaLH24N7mGYLAAAAAAAApbVJ1KWtE7Ri9zFl5BaqTvSpqbXeGJEH95HMAwAA7jFMzm1uwQYYAAAAlYbFbFLXlFr+DgNnQTIPAAC4xdn18FgzDwAAAPAcknkAAMA9xn8PZ8oBAAAA8Ah2swUAAAAAAAAqCUbmAQAAtxiGSYYT6+E5UwYAAACAc0jmAQAA9zGFFgAAAPApptkCAAAAAAAAlQQj8wAAgFuYZgsAAAD4HiPzAACAewwXDgAAAKAcvXr10siRI8u936hRI7300ks+ay/QkcwDAAAAAAAAKgmm2QIAADeZ/ns4Uw4AAACVgs0q7V0m5R2RoupKyd0ks8XfUeFPGJkHAADcwzRbAACAqmXzV9JLbaRZl0uf3X7qf19qc+q6l5WWluq+++5TbGysateurXHjxskwyu5ITp06VW3btlVkZKQaNGigESNGKC8vz6HML7/8ol69eikiIkJxcXHq16+fjh8/XmZ933zzjWJjYzV79myPfy5vIJkHAAAAAABQ3W3+Svr4FinnoOP1nEOnrns5oTdr1iwFBQVpxYoVevnllzV16lS99dZbZZY1m836v//7P23atEmzZs3Sjz/+qIcffth+f+3aterTp49at26tX3/9VUuXLtXAgQNltVrPqOv999/XDTfcoNmzZ+umm27y2ufzJKbZAgAA9zg76o6ReQAAAIHNZpW+f0Rld9wMSSbp+7FSywFem3LboEEDvfjiizKZTGrRooU2bNigF198UXfeeecZZf+8eUWjRo301FNP6Z577tG///1vSdKzzz6r888/334uSampqWfU8+qrr+rxxx/X119/rZ49e3r+Q3kJyTwAAOAew3TqcKYcAAAAAtfeZWeOyHNgSDkHTpVr3MMrIfztb3+TyfS/fmPXrl31wgsvlDma7ocfftDkyZO1ZcsW5eTkqLS0VIWFhcrPz1dERITWrl2ra6+99qztffrpp8rIyNAvv/yizp07e/zzeBPTbAEAgFsMw/kDAAAAASzviGfLedGePXt0+eWXq127dvrss8+0atUqvfrqq5Kk4uJiSVJ4ePg56+nQoYPi4+P1zjvvlLs2X6ByOZlXUFCgpUuXavPmzWfcKyws1LvvvuuRwAAAAOAc+mcAAKBCoup6tpwbli9f7nD+22+/qVmzZrJYHKf1rlq1SjabTS+88IL+9re/qXnz5jp40HFUYbt27bRw4cKztpeSkqJFixbpyy+/1D/+8Q/PfAgfcSmZt23bNrVq1UoXXXSR2rZtq549e+rQoUP2+9nZ2br11ls9HiQAAAhA7GYbEOifAQCACkvuJsUkSSpveRSTFFPvVDkvSU9P16hRo7R161Z98MEHmjZtmh544IEzyjVt2lQlJSWaNm2adu3apffee0+vvfaaQ5lHH31Uv//+u0aMGKH169dry5Ytmj59uo4ePepQrnnz5lq0aJE+++wzh3X4Ap1LybxHHnlEbdq0UUZGhrZu3aro6GhdeOGFSk9P91Z8AAAgUJ1eM8+ZA15D/wwAAFSY2SKlTfnvyV/7bv89T3vGa5tfSNItt9yigoICdenSRffee68eeOAB3XXXXWeUO++88zR16lRNmTJFbdq00ezZszV58mSHMs2bN9f8+fO1bt06denSRV27dtWXX36poKAzt45o0aKFfvzxR33wwQd66KGHvPb5PMlkuDAxuG7duvrhhx/Utm1bSZJhGBoxYoS+/fZbLVq0SJGRkUpKSipzccI/y8nJUWxsrLKzsxUTE1OxTwAAALyivN/r09fr/9+/ZA4PO2c9toJC7b//CX73vcQT/TP6ZgAAVA5e/83e/NWpXW3/vBlGTL1TibzWgzzfHtzi0m62BQUFDllMk8mk6dOn67777lPPnj31/vvvezxAAAAQmEzGqcOZcvAe+mcAAMBjWg+SWg44tWtt3pF
Ta+Qld/PqiDy4zqVkXsuWLbVy5Uq1atXK4forr7wiSRo0iCwtAADVhrPr4ZHM8yr6ZwAAwKPMFqlxD39HgbNwac28K6+8Uh988EGZ91555RXdcMMNlW47XwAAgMqM/hkAAED14tKaeZ7CuiwAAAS+c62Z1+DFiU6vmbfvwXH87gcw+mYAAFQO/GZDcnFkniTt2bNHb775pl599VVt3LjRGzEBAIDKwHDhcIHVatW4cePUuHFjhYeHKyUlRRMnTnQYXTZs2DCZTCaHIy0tzX5/z549uv322x3qGD9+vIqLix3aWr9+vXr06KGwsDA1aNBAzz77rBtfhP/RPwMAAKg+XFozb9GiRbr88stVUFBw6uGgIL3zzjv6+9//7pXgAABA9TNlyhRNnz5ds2bNUmpqqlauXKlbb71VsbGxuv/+++3l0tLSNGPGDPt5aGio/Z+3bNkim82m119/XU2bNtXGjRt155136uTJk3r++eclnXqz3bdvX11yySV67bXXtGHDBt12222qUaOG7rrrLt994AqifwYAAFC9uDQyb9y4cbr00kt14MABZWVl6c4779TDDz/srdgAAEAg89LIvGXLlumKK67QgAED1KhRI11zzTXq27evVqxY4VAuNDRUCQkJ9iMuLs5+73Sir2/fvmrSpIkGDRqk0aNH6/PPP7eXmT17toqLi/XOO+8oNTVVQ4YM0f3336+pU6e68WX4D/0zAACA6sWlZN7GjRv19NNPKzExUXFxcXruueeUkZGhrKwsb8UHAAAClYvJvJycHIejqKiozGq7deumhQsXatu2bZKkdevWaenSperfv79DucWLF6tOnTpq0aKFhg8ffs7+SHZ2tmrWrGk///XXX3XRRRcpJCTEfq1fv37aunWrjh8/7vz34Gf0zwAAAKoXl5J5OTk5ql27tv08IiJC4eHhys7O9nhgAAAgwBkm5w9JDRo0UGxsrP2YPHlymdWOHTtWQ4YMUcuWLRUcHKwOHTpo5MiRuummm+xl0tLS9O6772rhwoWaMmWKlixZov79+8tqtZZZ544dOzRt2jTdfffd9muHDx9W3bp1HcqdPj98+HCFvhpfon8GAABQvbi0Zp4kzZs3T7GxsfZzm82mhQsXOiy2PGjQIM9EBwAAqox9+/Y57Lr25zXu/uzjjz/W7Nmz9f777ys1NVVr167VyJEjlZSUpKFDh0qShgwZYi/ftm1btWvXTikpKVq8eLH69OnjUN+BAweUlpama6+9VnfeeacXPpn/0T8DAACVWa9evdS+fXu99NJL/g5FkjRz5kyNHDlSJ06ckCQ9+eSTmjNnjtauXevXuE5zOZl3uhP9Z39+y20ymcp9Kw4AAKoOk3HqcKacJMXExDgk88ozZswY++g86VSybu/evZo8eXKZ/RBJatKkiWrXrq0dO3Y4JPMOHjyo3r17q1u3bnrjjTccnklISNCRI0ccrp0+T0hIOPcHCyD0zwAAADzn+uuv12WXXWY/Hz16tP7xj3/4MSJHLiXzbDabt+IAAACVjbObW7i4AUZ+fr7MZseVQCwWy1n7Ifv371dWVpYSExPt1w4cOKDevXurU6dOmjFjxhl1du3aVY8//rhKSkoUHBwsSVqwYIFatGjhsJlGoKN/BgAAPMlqs2p1xmpl5mcqPiJeHet0lMVs8XdYPhUeHq7w8HD7eVRUlKKiovwYkSOX1sw7F5vNprlz53qySgAAUM0MHDhQkyZN0jfffKM9e/boiy++0NSpU3XllVdKkvLy8jRmzBj99ttv2rNnjxYuXKgrrrhCTZs2Vb9+/SSdSuT16tVLDRs21PPPP6/MzEwdPnzYYS28G2+8USEhIbr99tu1adMmffTRR3r55Zc1atQov3xub6F/BgAAnPXD3h/U77N+um3ebXrk50d027zb1O+zfvph7w9eb9tms+nhhx9WzZo1lZCQoCeffNJ+b+rUqWrbtq0iIyPVoEEDjRgxQnl5eZIkwzAUHx+vTz/91F6+ffv2Di95ly5dqtDQUOXn55+zPunUNNsaNWrYz5988km1b9/efv7777/r0ksvVe3atRUbG6uePXtq9erVDp/HZDLprbfe0pVXXqmIiAg1a9ZMX331lSe+Ks8k83bs2KHHHntM9evXt3e0AQAA3DFt2jRdc801GjFihFq1aqXRo0fr7rvv1sSJEyWdGqW3fv16DRo0SM2bN9ftt9+uTp066eeff7avw7dgwQLt2LFDCxcuVP369ZWYmGg/TouNjdX8+fO1e/duderUSQ899JCeeOIJ3XXXXX753J5G/wwAALjih70/aNTiUTqS77gMSUZ+hkYtHuX1hN6sWbMUGRmp5cuX69lnn9W//vUvLViwQJJkNpv1f//3f9q0aZNmzZqlH3/8UQ8//LCkU0mziy66SIsXL5YkHT9+XH/88YcKCgq0ZcsWSdKSJUvUuXNnRUREnLM+Z+Tm5mro0KFaunSpfvvtNzVr1kyXXXaZcnNzHcpNmDBB1113ndavX6/LLrtMN910k44dO1bRr0omwzBcnPxySkFBgT755BO99dZb+uWXX9SjRw8NGTJEV1555Rk7w/1VTk6OYmNjlZ2d7dTaOQAAwPfK+70+fT15ylMyh4Wdsx5bYaH2PvJPfvd9wN3+GX0zAAAqB2/9ZlttVvX7rN8ZibzTTDKpbkRdfX/1916ZcturVy9ZrVb9/PPP9mtdunTRxRdfrGeeeeaM8p9++qnuueceHT16VNKpl8Gvv/66Nm7cqC+//FKTJ09WQkKC0tLSdM899+jSSy9Vly5dNGnSpDLb/2t9rm6AYbPZVKNGDb3//vu6/PLLJZ1KMv7zn/+0v5A+efKkoqKi9N133yktLc2t7+k0l0fm/f7777r77ruVkJCgl156SVdccYVMJpP+/e9/65577jlnIg8AAACeRf8MAABUxOqM1eUm8iTJkKHD+Ye1OmN1uWUqql27dg7niYmJysjIkCT98MMP6tOnj+rVq6fo6GjdfPPNysrKsk+b7dmzpzZv3qzMzEwtWbJEvXr1Uq9evbR48WKVlJRo2bJl6tWrl73uc9V3LkeOHNGdd96pZs2aKTY2VjExMcrLy1N6enq5nykyMlIxMTH2z1QRLiXz2rVrp2uvvVa1atXSsmXLtHr1aj300EMymUwVDgQAAFQyhsn5A15D/wwAAFRUZn6mR8u54/SGZKeZTCbZbDbt2bNHl19+udq1a6fPPvtMq1at0quvvipJKi4uliS1bdtWNWvW1JIlSxySeUuWLNHvv/+ukpISdevWTZKcqu9chg4dqrVr1+rll1/WsmXLtHbtWtWqVeuM58v7TBXl0m62W7du1fXXX6/evXurdevWFW4cAABUYl7azRauoX8GAAAqKj4i3qPlPGnVqlWy2Wx64YUXZDafGpP28ccfO5QxmUzq0aOHvvzyS23atEndu3dXRESEioqK9Prrr+v8889XZGSk0/Wdyy+//KJ///vfuuyyyyRJ+/bts0/R9QWXRubt2rVLLVq00PDhw1W/fn2NHj1aa9as4c0vAACAn9A/AwAAFdWxTk
fVjagrk8ruP5hkUkJEgjrW6ejjyKSmTZuqpKRE06ZN065du/Tee+/ptddeO6Ncr1699MEHH6h9+/aKioqS2WzWRRddpNmzZ6tnz54u13c2zZo103vvvac//vhDy5cv10033aTw8PAKf1ZnuZTMq1evnh5//HHt2LFD7733ng4fPqwLL7xQpaWlmjlzprZt2+atOAEAQKAxXDjgNfTPAABARVnMFo3tMlaSzkjonT5/pMsjXtn84lzOO+88TZ06VVOmTFGbNm00e/ZsTZ48+YxyPXv2lNVqdVgb7/TGGn++5mx9Z/P222/r+PHj6tixo26++Wbdf//9qlOnjrsf0WVu72Z7WnZ2tmbPnq133nlHq1evVps2bbR+/fqzPsOOaQAABL5z7WbbaNIkp3ez3fP44/zu+5Cr/TP6ZgAAVA7e/s3+Ye8PembFMw6bYSREJOiRLo/okuRLPN4e3OPSmnlliY2N1YgRIzRixAitXbtW77zzjifiAgAAgY418wIW/TMAAOCOS5IvUe8GvbU6Y7Uy8zMVHxGvjnU6+mVEHspX4WTen7Vv317/93//58kqAQAAUAH0zwAAgCssZos6J3T2dxg4C5eSeRdffPE5y5hMJi1cuNDtgAAAQCXByLyAQP8MAACgenEpmbd48WIlJydrwIABCg4O9lZMAACgEjAZpw5nysF76J8BAABULy4l86ZMmaIZM2bok08+0U033aTbbrtNbdq08VZsAAAAOAf6ZwAAANWL2ZXCY8aM0ebNmzVnzhzl5ubqwgsvVJcuXfTaa68pJyfHWzECAIBAZJicP+A19M8AAACqF5eSead17dpVb775pg4dOqR7771X77zzjpKSkugwAgBQnRguHPA6+mcAAADVg1vJvNNWr16tJUuW6I8//lCbNm1YpwUAAMDP6J8BAABUbS4n8w4ePKinn35azZs31zXXXKOaNWtq+fLl+u233xQeHu6NGAEAQAA6vQGGMwe8i/4ZAABA9eHSBhiXXXaZFi1apL59++q5557TgAEDFBTkUhUAAKCqcHYKLck8r6J/BgAAqqNevXqpffv2eumll/wdis+51NP7/vvvlZiYqPT0dE2YMEETJkwos9zq1as9EhwAAADOjv4ZAACoyhYvXqzevXvr+PHjqlGjhr/DCQguJfOeeOIJmUzsSAcAACQ5O4WWkXleRf8MAAB4kmG1Kn/lKpVmZiooPl4R53eSyWLxd1g+UVxcrJCQEH+HcU4uJfOefPJJL4UBAAAqHabZBgT6ZwAAwFNy5s/Xkacnq/TwYfu1oIQE1X3sUcX07eu1douKijRmzBh9+OGHysnJ0fnnn68XX3xR8fHx6t27tyQpLi5OkjR06FDNnDlTkmSz2fTwww/rrbfeUkhIiO655x6HvtGJEyc0evRoffnllyoqKrLXe95550k61Y+aM2eO7rvvPk2aNEl79+6VzWbz2uf0FJc2wIiLi1PNmjXPOBo3bqx+/fppwYIF3ooTAAAEGsOFA15D/wwAAHhCzvz5OvDASIdEniSVHjmiAw+MVM78+V5r++GHH9Znn32mWbNmafXq1WratKn69eun6OhoffbZZ5KkrVu36tChQ3r55Zftz82aNUuRkZFavny5nn32Wf3rX/9y6Ptce+21ysjI0HfffadVq1apY8eO6tOnj44dO2Yvs2PHDn322Wf6/PPPtXbtWq99Rk9yaWReeYsKnjhxQqtWrdLll1+uTz/9VAMHDvREbAAAADgH+mcAAKCiDKtVR56eLBllvIU1DMlk0pGnJyu6Tx+PT7k9efKkpk+frpkzZ6p///6SpDfffFMLFizQO++8o86dO0uS6tSpc8aaee3atdP48eMlSc2aNdMrr7yihQsX6tJLL9XSpUu1YsUKZWRkKDQ0VJL0/PPPa86cOfr000911113STo1tfbdd99VfHy8Rz+XN7mUzBs6dOhZ77dv316TJ0+mswgAQDVgcnLNPKfW1YPb6J8BAICKyl+56owReQ4MQ6WHDyt/5SpFXtDFo23v3LlTJSUluvDCC+3XgoOD1aVLF/3xxx/2ZF5Z2rVr53CemJiojIwMSdK6deuUl5enWrVqOZQpKCjQzp077efJycmVKpEnuZjMO5fLL79cTz31lCerBAAAQAXQPwMAAOdSmpnp0XK+Ehwc7HBuMpnsa97l5eUpMTFRixcvPuO5P4/wi4yM9GaIXuHRZF5RUVGl2PUDAACguqB/BgAAziXIyZFpzpZzRUpKikJCQvTLL78oOTlZklRSUqLff/9dI0eOtPdjrFarS/V27NhRhw8fVlBQkBo1auTpsP3KpQ0wzuXtt99W+/btPVklAAAIVGyAUSnQPwMAAOcScX4nBSUkSCZT2QVMJgUlJCji/E4ebzsyMlLDhw/XmDFj9P3332vz5s268847lZ+fr9tvv13JyckymUyaO3euMjMzlZeX51S9l1xyibp27arBgwdr/vz52rNnj5YtW6bHH39cK1eu9Pjn8CWXRuaNGjWqzOvZ2dlavXq1tm3bpp9++skjgQEAgMDGmnmBgf4ZAACoKJPForqPPaoDD4w8ldD780YY/03w1X3sUY9vfnHaM888I5vNpptvvlm5ubk6//zzNW/ePMXFxSkuLk4TJkzQ2LFjdeutt+qWW27RzJkzz/2ZTCZ9++23evzxx3XrrbcqMzNTCQkJuuiii1S3bl2vfA5fMRlGWVuVlK13795lXo+JiVGLFi00fPhwNW7c+Jz15OTkKDY2VtnZ2YqJiXE+WgAA4DPl/V6fvt507NOyhIWdsx5rYaF2PPMYv/te4on+GX0zAAAqB2//ZufMn68jT0922AwjKCFBdR97VDF9+3q8PbjHpZF5ixYt8lYcAACgMmLUnd/RPwMAAJ4S07evovv0ObW7bWamguLjFXF+J6+NyIN7PLoBBgAAqEacXQ+PhB8AAEClYbJYFHlBF3+HgbPw6AYYAAAAAAAAALyHkXkAAMAtbIABAAAA+B7JPAAA4B6m2QIAAAA+RzIPAAC4hZF5AAAAgO+xZh4AAAAAAABQSTAyDwAAuIdptgAAAIDPkcwDAADuIZkHAAAA+BzTbAEAAAAAABDwZs6cqRo1apy1zLBhwzR48GCfxOMvjMwDAABuYQMMAAAABJqXX35ZhvG/DmivXr3Uvn17vfTSS/4LysNI5gEAAPcwzRYAAKDKsdkMHdp+QidzihQZE6rEZjVkNpv8HZbTYmNj/R2C1zHNFgAAAAAAANq5JkPvPrZMc15cowVvb9acF9fo3ceWaeeaDK+1OXfuXNWoUUNWq1WStHbtWplMJo0dO9Ze5o477tDf//53+/m8efPUqlUrRUVFKS0tTYcOHbLf+/M022HDhmnJkiV6+eWXZTKZZDKZtGfPHknSxo0b1b9/f0VFRalu3bq6+eabdfToUa99Tk8imQcAANxjuHAAAAAgoO1ck6HvX9+okyeKHK6fPFGk71/f6LWEXo8ePZSbm6s1a9ZIkpYsWaLatWtr8eLF9jJLlixRr169JEn5+fl6/vnn9d577+mnn35Senq6Ro8eX
WbdL7/8srp27ao777xThw4d0qFDh9SgQQOdOHFCF198sTp06KCVK1fq+++/15EjR3Tdddd55TN6Gsk8AADgltNr5jlzAAAAIHDZbIZ+/mj7Wcss/Xi7bDbPd+xiY2PVvn17e/Ju8eLFevDBB7VmzRrl5eXpwIED2rFjh3r27ClJKikp0Wuvvabzzz9fHTt21H333aeFCxeWW3dISIgiIiKUkJCghIQEWSwWvfLKK+rQoYOefvpptWzZUh06dNA777yjRYsWadu2bR7/jJ5GMg8AAAAAAKAaO7T9xBkj8v4q73iRDm0/4ZX2e/bsqcWLF8swDP3888+66qqr1KpVKy1dulRLlixRUlKSmjVrJkmKiIhQSkqK/dnExERlZLg2anDdunVatGiRoqKi7EfLli0lSTt37vTcB/MSNsAAAADuYQMMAACAKuFkztkTea6Wc1WvXr30zjvvaN26dQoODlbLli3Vq1cvLV68WMePH7ePypOk4OBgh2dNJpPD7rXOyMvL08CBAzVlypQz7iUmJrr3IXyIZB4AAHCLs1NomWYLAAAQ2CJjQj1azlWn18178cUX7Ym7Xr166ZlnntHx48f10EMPuV13SEiIfXON0zp27KjPPvtMjRo1UlBQ5UuNMc0WAAC4hw0wAAAAqoTEZjUUWePsibqouFAlNqvhlfbj4uLUrl07zZ49277RxUUXXaTVq1dr27ZtDiPzXNWoUSMtX75ce/bs0dGjR2Wz2XTvvffq2LFjuuGGG/T7779r586dmjdvnm699dYzEn+BiGQeAAAAAABANWY2m9Tj+mZnLdP9umYym01ei6Fnz56yWq32ZF7NmjXVunVrJSQkqEWLFm7XO3r0aFksFrVu3Vrx8fFKT09XUlKSfvnlF1mtVvXt21dt27bVyJEjVaNGDZnNgZ8qMxmuTiz2gJycHMXGxio7O1sxMTG+bh4AADihvN/r09dbjXhaltCwc9ZjLSrUH/9+jN/9AEbfDACAysHbv9k712To54+2O2yGERUXqu7XNVNKhzoebw/uqXwTgwEAQEAw/fdwphwAAAACX0qHOmp8Xvyp3W1zihQZc2pqrTdH5MF1JPMAAAAAAAAg6dSU23ot4vwdBs6CZB4AAHCPs5tbsAEGAAAA4DEk8wAAgFtMxqnDmXIAAAAAPCPwt+gAAAAAAAAAIImReQAAwF1MswUAAAB8jmQeAABwH4k6AAAAwKeYZgsAAAAAAABUEozMAwAAbmEDDAAAAMD3SOYBAAD3sGYeAAAA4HMk8wAAgFsYmQcAAAD4HmvmAQAAAAAAAJUEI/MAAIB7mGYLAAAA+BzJPAAA4Bam2QIAAAC+xzRbAAAAAAAAoJIgmQcAANxjuHC4wGq1aty4cWrcuLHCw8OVkpKiiRMnyjD+V9GwYcNkMpkcjrS0NId6Jk2apG7duikiIkI1atQos62/1mEymfThhx+6FjAAAADgQ0yzBQAA7vHSmnlTpkzR9OnTNWvWLKWmpmrlypW69dZbFRsbq/vvv99eLi0tTTNmzLCfh4aGOtRTXFysa6+9Vl27dtXbb79dbnszZsxwSASWl/gDAAAAAgHJPAAAEFCWLVumK664QgMGDJAkNWrUSB988IFWrFjhUC40NFQJCQnl1jNhwgRJ0syZM8/aXo0aNc5aDwAAABBImGYLAADccnoDDGcOScrJyXE4ioqKyqy3W7duWrhwobZt2yZJWrdunZYuXar+/fs7lFu8eLHq1KmjFi1aaPjw4crKynLrc9x7772qXbu2unTponfeecdhOi8AAAAQaBiZBwAA3OPiNNsGDRo4XB4/fryefPLJM4qPHTtWOTk5atmypSwWi6xWqyZNmqSbbrrJXiYtLU1XXXWVGjdurJ07d+qxxx5T//799euvv8pisTj9Ef71r3/p4osvVkREhObPn68RI0YoLy/PYTovAAAAEEhI5gEAAJ/Yt2+fYmJi7Od/XePutI8//lizZ8/W+++/r9TUVK1du1YjR45UUlKShg4dKkkaMmSIvXzbtm3Vrl07paSkaPHixerTp4/TMY0bN87+zx06dNDJkyf13HPPkcwDAABAwGKaLQAAcIvJMJw+JCkmJsbhKC+ZN2bMGI0dO1ZDhgxR27ZtdfPNN+vBBx/U5MmTy42lSZMmql27tnbs2FGhz3TBBRdo//795U4BBgAAAPyNkXkAAMA9XtrNNj8/X2az4/tGi8Uim81W7jP79+9XVlaWEhMTXWvsL9auXau4uLhyE40AAACAv5HMAwAAbvnz5hbnKueKgQMHatKkSWrYsKFSU1O1Zs0aTZ06VbfddpskKS8vTxMmTNDVV1+thIQE7dy5Uw8//LCaNm2qfv362etJT0/XsWPHlJ6eLqvVqrVr10qSmjZtqqioKH399dc6cuSI/va3vyksLEwLFizQ008/rdGjR7sWMAAAAOBDJPMAAEBAmTZtmsaNG6cRI0YoIyNDSUlJuvvuu/XEE09IOjVKb/369Zo1a5ZOnDihpKQk9e3bVxMnTnQYUffEE09o1qxZ9vMOHTpIkhYtWqRevXopODhYr776qh588EEZhqGmTZtq6tSpuvPOO337gQEAAAAXmAzDcPF9ecXl5OQoNjZW2dnZDgthAwCAwFHe7/Xp6x1unCRLSNg567EWF2rN+4/zux/A6JsBAFA58JsNiZF5AADATd6aZgsAAACgfOxmCwAAAAAAAFQSjMwDAADu8dJutgAAAADKRzIPAAC4hWm2AAAAgO8xzRYAAAAAAACoJBiZBwAA3MM0WwAAAMDnSOYBAAC3MYUWAAAA8C2SeQAAwD2GcepwphwAAAAAj2DNPAAAAAAAAKCSYGQeAABwC7vZAgAAAL5HMg8AALiHDTAAAAAAn2OaLQAAAAAAAFBJMDIPAAC4xWQ7dThTDgAAAIBnkMwDAADuYZotAAAA4HNMswUAAAAAAAAqCUbmAQAAt7CbLQAAAOB7JPMAAIB7DOPU4Uw5AAAAAB7BNFsAAAAAAACgkmBkHgAAcAvTbAEAAADfI5kHAADcw262AAAAgM+RzAMAAG5hZB4AAADge6yZBwAAAAAAAFQSjMwDAADuYTdbAAAAwOdI5gEAALcwzRYAAADwPabZAgAAAAAAAJUEI/MAAIB72M0WAAAA8DmSeQAAwC1MswUAAAB8j2m2AAAAAAAAQCXByDwAAOAem3HqcKYcAAAAAI8gmQcAANzDmnkAAACAzzHNFgAAAAAAAKgkGJkHAADcYpKTG2B4PRIAAACg+iCZBwAA3GMYpw5nygEAAADwCJJ5AADALSbDyZF55PIAAAAAj2HNPAAAAAAAAKCSYGQeAABwD7vZAgAAAD5HMs8DMtIztXfzfhWeLFJJcaksQRaFRYQoMSVB9ZsnymxmACQAoOoxGYZMTqyH50wZwJNsNkPHD59U7tFClZbYZLPaZAk2KzjUolpJUYqsEervEAEAANxGMs8Nfyzfrl+/+l3bV+/S9lW7lH00t9yyEdHhSmnfSM06NlHHS9upc1p7knsAAAAeZLPatGd9lvZvO67Mvbk6uj9XpcW2
cstHxIQoPjla8Q2j1aR9vOIbRPswWgAAgIohmeekwvwi/fj+Us19bZ62r97t9HP5uQXa8PMf2vDzH/r85W+U0CheA+66VP3v6KPY2jFejBgAAC+z/fdwphzgBSezi7Tp54Pa/PMBncwudvq5/Jxi7d2Qpb0bsrTymz1KaBKjNj3rq2nHOrIE89IVAAAENpJ552C1WvXpC3P14TNfKO/EyQrXd3hPpt5+7H29O+ET9b/9Yt3xzE0Kjwr3QKQAAPgW02zhL4UnS/TLZzu07bfDstkq/ufr8K4cHd61WUs/2a4ulzdWm571ZDKZPBApAACA55HMO4u9f+zX87e+qi0rdni87pKiEn3173la/s1qjXpruDr2aevxNgAAAKqa3esytXj2VuXnOD8Sz1mFeSX66cNt2rk6Qxff0koxtXnhCgAAAg/zCMpgGIY+evZLDe/4sFcSeX92ZG+mxvadqJeHv6GigiKvtgUAgEcZLhxABRUXlGrBjE36dvoGryTy/uzAthP6YOIKbVyy36vtAAAAuIOReX9htVr10l2v6/sZi3zWpmEYmvv6Au3dvF8Tvx6ryJgIn7UNAIDbDOPU4Uw5oAIKcov19bR1ykwvf9MxTystsmrJB9t0IrNA3a9p5rN2AQAAzoWReX9is9n0zM3TfJrI+7MNP/+hhy/5l07m5PulfQAAgEBTkFusL15Y7dNE3p+t+2GfFr+/1S9tAwAAlIVk3p+8fM8bWvzhL36NYdvKnRo36Bmm3AIAAp7JcP4A3FFUUKqv/m+tjh/274vOTT8d0K9feHfpFQAAAGcxzfa/vn3zB3371kL7ec26JWrSqkCRMVYFhxqyWU0qLjIp82Cwdm0OV0mR9/KgG376Q2+MeU//eOUOr7UBAECFMc0WXrZk9hYd3Zfn7zAkSavnpatOoxildKjj71AAAEA1RzJPUsa+o/rmtdf094eOqHm7fDVtV6BadUvLLV9aIqVvC9O29RHauDxSP31dQ0UFnk3ufT19vi66pqvO65Xq0XoBAPAUk+3U4Uw5wFU712Ro+8oMf4fhYMkH21SvWZzCooL9HQoAAKjGqlwyLz+3QNtX71JG+lEVFxTLZrUpODRYETHhaty2oeo3T5LJZJIkGUapVLRAOTue1rRvjjjdRlCw1CS1UE1SC5V2wzHdM+GAFnxSU3Nn1dL+nWEe+RyGYej52/+tN9a/oPBIz9QJAADgDyezi5SZnquC3BJZS09ldy1BZoVHByu+YbQiY0PtZYvyS7R+0X79/s1uf4VbroKcYv304Vb1vaONv0MBAADVWKVP5uXnFmjRB0u1/qfN2r5ql/ZvOyTjLNN5ImLC1bRDY11280l177tUwUHH1KRVxWKIirXpyjuO6so7jmr5gmi98lh9ZRwIqVilkg7vztCMxz/QiJdurXBdAAB4HNNsUY7cY4Xa+tthHdmdrYz0XOVnF5+1fERsiOIbRKu4sFQZe3NkLQncPzPbV2aoWedMNT4v3t+hAACAaqrSJvN2b0zX1/+ep4Wzf1Z+boHTz4WF5eiqW79R1745Xonrgktz1eZvW/XWxCR9+59aFa7v2zd/0M3jr1V0XJQHogMAwIOM/x7OlEOVZxiG0jcf08YlB7R3Y5YMm/P/4vOzi7U3O8uL0XnW6nl7SeYBAAC/qXTJvMN7MvTy8De0ct46l5/tecVx/ePpA4qOs3ohsv+JjLbpgWf366KBJ/TcAw2Vddj9dVWKCoo1b8YiXTNqoAcjBAAA8Jz9W49ryftbdeKIf3ed9ZXDu3KUuS9X8Q2i/R0KAACohry3JauHGYahr/49T3e1e8itRN5192bosenpXk/k/VmHHnl68avtSmpcVKF65r6+4KxThwEA8AeTYTh9oGoqLizVkve36suX1lSbRN5pG5cc8HcIAACgmqoUybyjB7L08CUTNO2+t1SQV+jy8zc9eFi3P37IC5GdW936JXrhix2q18T9hN6B7Ye0ZuEGD0YFAIAHnF4zz5kDVc7BHSf04cQV2vjTgWo5lXrb70dUXFjq7zAAAEA1FPDJvP3bD+mBC/+ptYs2ufX84NszdcsY53eq9YaadUr1zEc7FV/v7Is/n82aHzd6MCIAAAD37Vqbqa9eWqvcLNdfslYVpUVWHdntnTWYAQAAziagk3n7tx/SQz2fUEb6Ubeeb33+Sd315EEPR+WeOvVK9Oi/98pkcu/V9fbVuzwcEQAAFWRIsjlxVMNRW1XZjlUZmvfGRllLbf4Oxe8y03P9HQIAAKiGAjaZl7k/S49c+i8dO3zCreeDQ20a9eI+WSyejasiUjvn66q7Mt16dvsqknkAgMDCmnnVT/rmLC2YsUk2F3aqrcoy9pLMAwAAvheQyTzDMDT5ppfdHpEnScMeOawGKRXbeMIbbnn4sFvr5+Vk5erwngwvRAQAAHBuJ7OLNP/tTbKVksg7LTOdabYAAMD3AjKZN2fad9rw8x9uP9/8vHxdead7I+C8LSzc0Kip6W49e2iXf9f+AwDAgSEnN8Dwd6DwhCXvb1XRSTZ8+LOco4UyGHkKAAB8LOCSeQd2HNI7j71foTquHZ4RUNNr/6pNl3y17nzS5eeKC9zfQAMAAI9jN9tqY+vyw9q9zv0ZE1WZtYS1AwEAgG8FXDLvpbtfV2G++9Nja9YpUde0wJ/yMHCo6x3i0hKrFyIBAMBNzmx+cfpApVV4skQ/f7zN32EELJuVZDUAAPCtgErmbV25U2sXbapQHf1vylJwSOB3qroPyFZsTdemqgSHBnspGgAAgLJtXnqQ6bVnYQkKqO40AACoBgKq9/H1v+dVsAZD/W865pFYvC0k1FDfIa7FGh4V5qVoAABwHbvZVn2GzdCmnw/4O4yAZTabZAkOqO40AACoBgKm95F7PE+LP/qlQnXUTylSfFKJhyLyvvYX5rlUPjm1vpciAQDADayZV+Xt3ZSlnKOF/g4jYMUlRfo7BAAAUA0FTDLvh/d+UlEFN3ho1q7AQ9H4RtN2+U6XTWgUr5ia0V6MBgCAwGC1WjVu3Dg1btxY4eHhSklJ0cSJEx12DR02bJhMJpPDkZaW5lDPpEmT1K1bN0VERKhGjRpltpWenq4BAwYoIiJCderU0ZgxY1RaypTS0zYvPejvEAJanYb0zQAAgO8F+TuA09Yt3ljhOpq5kBwLBDVqWRVfr1iZB0LOWbZZpyY+iAgAABc4O+rOxZF5U6ZM0fTp0zVr1iylpqZq5cqVuvXWWxUbG6v777/fXi4tLU0zZsywn4eGhjrUU1xcrGuvvVZdu3bV22+/fUY7VqtVAwYMUEJCgpYtW6ZDhw7plltuUXBwsJ5++mmXYq6KDMPQgW0n/B1GQIsnmQcAAPwgYJJ521btqnAdlW1kniQ1a1vgXDKvY4oPogEAwAVeSuYtW7ZMV1xxhQYMGCBJatSokT744AOtWLHCoVxoaKgSEhLKrWfChAmSpJkzZ5Z5f/78+dq8ebN++OEH1a1bV+3bt9fEiRP1yCOP6Mknn1RIyLl/n6uy7MwCFRcwSvFs4pNJ5gEAAN8LiGm2JzKzlbkvq8L1JDS
s2DRdf0hMLnKqXNdB53s5EgAAvCsnJ8fhKCoq+zewW7duWrhwobZt2yZJWrdunZYuXar+/fs7lFu8eLHq1KmjFi1aaPjw4crKcq0v8euvv6pt27aqW7eu/Vq/fv2Uk5OjTZs2ufjpqp7M9Fx/hxDQImuEMs0WAAD4RUCMzNvugVF5khQSZvNIPb4UEnbu0QrterZWo9QGPogGAAAX2CSZnCwnqUEDx9+y8ePH68knnzyj+NixY5WTk6OWLVvKYrHIarVq0qRJuummm+xl0tLSdNVVV6lx48bauXOnHnvsMfXv31+//vqrLBaLU+EfPnzYIZEnyX5++PBhp+qoyjL3ksw7m9QeSTJbAuK9OAAAqGYCIpl3eE+mR+oJCqp8u+UFBZ875kHD+/kgEgAAXGMyDJmcmEJ7usy+ffsUExNjv/7XNe5O+/jjjzV79my9//77Sk1N1dq1azVy5EglJSVp6NChkqQhQ4bYy7dt21bt2rVTSkqKFi9erD59+lTkY+G/crLYxbY8ZotJrbsn+TsMAABQTQVEMq+4grvYnlZSbJb99X8lUVJ09iENNRPjdOGVXXwUDQAA3hMTE+OQzCvPmDFjNHbsWHvCrm3bttq7d68mT55sT+b9VZMmTVS7dm3t2LHD6WReQkLCGevwHTlyxH6vurOWWP0dQsBqfF68ImPLTkYDAAB4W0DMDbCWeqazWJAXEB/HJQX5Z4/55ieuVVBwQORcAQBwdHoDDGcOF+Tn58tsdvx9tFgsstnKf2G3f/9+ZWVlKTEx0el2unbtqg0bNigjI8N+bcGCBYqJiVHr1q1dirkqstkq34wHXzAHmdR5QCN/hwEAAKqxgMgSBYcGe6SevdvClNS4cm2CsWdLWLn3Ol7SVpfffakPowEAwAU2QzI5kfBxMSk0cOBATZo0SQ0bNlRqaqrWrFmjqVOn6rbbbpMk5eXlacKECbr66quVkJCgnTt36uGHH1bTpk3Vr9//lqZIT0/XsWPHlJ6eLqvVqrVr10qSmjZtqqioKPXt21etW7fWzTffrGeffVaHDx/WP//5T917773lTgGuTixBle8lqS90HtBYtepF+TsMAABQjQVEMi/IQ8m87evD1bVfjkfq8pUdGyLKvB4RHa5Rbw73cTQAALjA2VF3Lo7MmzZtmsaNG6cRI0YoIyNDSUlJuvvuu/XEE09IOjVKb/369Zo1a5ZOnDihpKQk9e3bVxMnTnRIwj3xxBOaNWuW/bxDhw6SpEWLFqlXr16yWCyaO3euhg8frq5duyoyMlJDhw7Vv/71L5firaoYmXemOsnR6tgv2d9hAACAai4gknkbftrkkXq2ry87MRaoDu4O0cmcsnfcu+u5m1U3Od7HEQEA4H/R0dF66aWX9NJLL5V5Pzw8XPPmzTtnPTNnztTMmTPPWiY5OVnffvutG1FWbTarTUf3sZvtn1mCzbr4llYym53ZwhkAAMB7/J7M2/jLFi35aJlH6tq+Ptwj9fjKtnKSj9eMGqgBdzG9FgAQ6JxdD48RXpXN6nnpOnmici1d4k1ms0n97khlei0AAAgIfk3mFRcW64Xb/+2xaRwdeuTJWipZ/J6idM6v887c0e/yuy/V3c/f4odoAABwkZem2cK/sg7m6fdvd/s7jIBhNpvUZ1grNT6PGRMAACAw+DXtteiDpdq/7VCF64mLL9EDz+6vVOvlHcsI0tJvYh2uXTfmCt055e9+iggAAEBa+e0e2UpJwEqnNgHpe0eqmrQnkQcAAAKHX5N53769sMJ1NGpZoKc/2KVadUs9EJHvzPugpkpLTu0SFx4VpofeHqGe13b1c1QAALjAZsipKbRspFBp5OcUa9eaTH+HERDiEiLU9442ql2fqbUAACCw+DWZt2fDPgWZ3N/Jtvl5+Xr6/V2KjrN6MCrvs5ZK3/ynlsO1Vn9r5qdoAABwk2E7dThTDpXClt8OymYl+SpJQSFm1UqK9HcYAAAAZzD7OwB3JTcv1KTZlS+RJ51K5GUeCLGfF+QV6vOXvvFjRAAAANKWZYf9HULAyEzP095NWf4OAwAA4AyVMpkXGWPVU7N3KaZm5UvkHdoborcmJp5xff6sxSoqKPJDRAAAuOn0BhjOHKgU8k7QF/mzjUsO+DsEAACAM1TKZN49Ew6oTr0Sf4fhMptNmjqqgYoKLGfcyz2Wp0UfLvNDVAAAuMlmOH8AlVD6pizlHC3wdxgAAAAOKl0yr/PFOep7/XF/h+GWr2fW0vpfy19Eee5r83wYDQAAAM7GMKRNPzM6DwAABJZKlcyLiLbqgWf3+zsMt/y2IEavja931jJbf9+pnGO5PooIAIAKYpotqoG9m475OwQAAAAHlSqZN2jYUcX/f3v3HxT1fedx/LXsD1hxWdwArkRAQMECh0FraWJMsCEGo+kdyUwqitGStKn3I2Njctcfnk0TM502mfPumjR/9PiRuYvJaM9zziYznbR6R35NZjxKUtsmarFRgtYai8iPlR/7vT867pVjgd2V5btfeD5mvn/w3c9+eX8Zx33z4vP5frKtt7z22H/N1Z4v5yk4Ypt07Mn/6ZiGigAAmAKGIgzzzC4UiN0fuvo0PGS95zQDAICZyzJhXlKSobu3WG9HsTd+7NW3tuZr6GpkP+oTxwjzAAAWwcw8zALBoKGLnb1mlwEAABDiMLuASFXe2aP5C60zKy/Qn6Tm7/h1qDFD0uQz8q452fab+BUFAACAqP3+oyvy53vNLgMAAECShcK8DVsvml1CxD4+7dKuzQXq+m1y1O89+2FXHCoCACAOgkFJwQjHAdbV/bt+s0sAAAAIsUSYl+weUcVq6yxvOPeRM6YgT5Ku9g9OcTUAAMRJpEtoWWYLixse5Jl5AAAgcVjimXmFZQHZ7WZXEbkl5YGY3zs8ODyFlQAAAOB6jYwQSAMAgMRhiTBvSbm1ljZ4fSPKujG2GXYOlyUmSwIAwAYYmDXs9siffwwAABBvlgjzisoHzC4harEGkMlzXFNcCQAAcRI0Ij8AC3O4LLREBAAAzHiWCPPyimNftmqWRUtjqzmnOHuKKwEAAMD1SJ8/x+wSAAAAQiyxpjMl1Xq74MVa85LlhVNcCQAA8WEYQRnG5J93kYxBYkh2OyT2ehgjM89jdgkAAAAhlpiZ53RZ75cApyu2JUVFny6Y4koAAIgTI8IltjwzzzKKVvrNLiHhJCXZlLFwrtllAAAAhJga5mXlZUQ0bmTIeg8dHhmOreYlKwjzAACAOT516wLJem1XXM3LTpXDyTPzAABA4jA1zLtr25qIxl0NWGIC4ShXB6LvhItXFirNxzIOAIBFsJvtjJOeNUc5S+eZXUZCySv1mV0CAADAKKamZGu3VcmbMXl4df6M9XZ4PX8mOer33LP9rjhUAgBAnASDkR+wjOU1i8wuIWHYbFLp6hvNLgMAAGAUU8O8NJ9Hf/39Bycdd/J96+0gdvJ9d1TjPb65WrNxVZyqAQAgDpiZNyMtLJ6nktXZZpeREHLLblBaRnQ9HQAAQLyZvn616gurtPq+ygnHRBuMmS0wYNNHJ1Kies/arVVypV
hvBiIAAJh5Vt23WB5fdL3MTFR2G7PyAABA4jE9zJOkv3n+SxMut7VamNfxS7eCI5E/M2+Ox617d6yPY0UAAEw9IxiM+IC1uFIcWrNlqdllmCorz6O80hvMLgMAAGCMhAjz5mV5tftHjynZHX5mWvdFpz76MPpn0Jnl/bfnRjX+4WcfUFZOZDv7AgCQMFhmO6PlfMqnm2sLzS7DFHZHku7YWiJbElv7AgCAxJMQYZ4kld9Wor8/sFNOlyPs6z/+V2v8ZXRkRHrt3yKvdfmd5br7S9VxrAgAACA2y+/K04qaPLPLmHYrNyySLzvV7DIAAADCSpgwT5Iq716upw5/TSmpY2fh/XS/TwN9CVVuWMeOePS7zsiefTcnza2dP/xKnCsCACBOgkbkByzrs39RqMrP55tdxrTJyvOoYu3sCzABAIB1JFw6tuLOZdrb+pQWleWMOt/fa9eRg+nmFBWFwy9GtlzW6XLoWz96TFm5mXGuCACAODEMyQhGcBDmWd2n787XnQ+WKDk1/AqKmcLjS9G6r/yZklheCwAAEljChXmStLgiXz849l3Vfb1Wdoc9dP4/fpipwauJ21x1/DJFx46Ov5HHNQ6nXd94eYeWV5dPQ1UAAADXr2ilX3W7K7WofGY+5zfV69Lnd9ykufPYxRcAACS2hAzzJMnpcqrh6U3653eeVsnNRZKks6dStG/vfJMrC294SHr2qzkyjInDxmS3S98+9He6tbZymioDACA+jKAR8YGZIdWbrPV/Wa7qL5bI45va0CspOKSM378n+/DAlF43EmmZbt37+AqlZ82Z9u8NAAAQrYRfK1G0olD/9NbTOtnWof/8wU90qOkN3bLusoqWTX+jN5FXvj9fvzk+cQNYUJ6nx1v+Sotvmj3PnQEAzGBGUFIwwnGYSYor/SpaOV+//cVFHf/vj3XmV5divlbKwEXd2PWGFpx/R66hPvW5s/TrpVvU4y2YworHV7g8U7fXFcvtieyZxwAAAGZL+DDvmiXLC7TzX7br4WcfUNtP/l0Fw9+RwzFidlmS/ri8dt8/jj9j0O6wq+7rtdq86z45nJb5kQMAAIzLlmRT/rJM5S/LVPeFfv2q5XV9/M6HuuLJ1YA7Q7KFXwDiGOqT58oZeXrPal73Sfku/Vo2/d/szdSBC1rx83/QmZw7dHrRegXt8QnZ3B6nbttYrMUrsuJyfQAAgHixXLI0Nz1Vt33hARkDHhmXvybJ3KU7ly449NSXF2lkOPzy2pvWlOrhZ7dqcQWz8QAAM4sRNGTYJv8cNtgAY8ZLz5qjW/72z9X1jW/q8sFva9ieot7UbI3YkxVMcspmjCgpOCR34BO5A59Mej2bDOWd/akyPvmFThXeq098JeOGg9GyO5K05DPzdUttIbPxAACAJZkS5l1r6nt6eq7jKnfIMB6V0fu9qSkqBle67dr9QI7OdCRJGgqdn+N1a83GW1Xz4OeUW3yjpOu9VwAApt+1z67xwrhh42pES2iH/+QzEolpanozKfXxx3T50iUFfvYzOQZPjmk0RyT1RnPBK10qbH9O2Sk+nfPfovP+z2jImRpTbWk3pGjpzQtU/Fm/3HNdGjICGuoJxHQtAADMMll/htnBZpjwL6Czs1M5OTnT/W0BAEAMzp49q4ULF4a+DgQCys/P1/nz5yO+ht/v1+nTp5WSwk6hiYjeDAAAa/n//RlmF1PCvGAwqK6uLnk8HtlsE+/+CgAAzGEYhq5cuaLs7GwlJY1e4hgIBDQ4OBjxtVwuF0FeAqM3AwDAGibqzzB7mBLmAQAAAAAAAIgeMS4AAAAAAABgEYR5AAAAAAAAgEUQ5gEAAAAAAAAWQZgHAAAAAAAAWARhHjDLVFVVaceOHWPOt7S0KD09XZL0xBNPyGazqaamZsy4Z555RjabTVVVVWNe6+zslMvlUllZWdjvbbPZQofX69WqVat05MiR0Outra265557lJ2dLZvNpkOHDsVyiwAAAJZBbwYAiBZhHoCwFixYoKNHj6qzs3PU+aamJuXm5oZ9T0tLi+6//3719PTo3XffDTumublZ586d01tvvaWMjAxt2LBBHR0dkqS+vj4tW7ZMzz///NTeDAAAgMXRmwEAriHMAxBWVlaW1q5dqxdffDF07u2339bFixe1fv36MeMNw1Bzc7O2bNmiTZs2qbGxMex109PT5ff7VVZWphdeeEEDAwN6/fXXJUnr1q3Tnj17VFtbG5+bAgAAsCh6MwDANYR5AMbV0NCglpaW0NdNTU3avHmzXC7XmLFHjx5Vf3+/qqurVV9fr1deeUV9fX0TXt/tdkuSBgcHp7RuAACAmYjeDAAgEeYBmMCGDRvU09Oj1tZW9fX1af/+/WpoaAg7trGxURs3bpTdbldZWZkKCgp04MCBca/d39+vXbt2yW636/bbb4/XLQAAAMwY9GYAAElymF0AgMTldDpVX1+v5uZmdXR0qKioSOXl5WPGdXd36+DBg3rzzTdD5+rr69XY2Kht27aNGltXVye73a6BgQFlZmaqsbEx7DUBAAAwGr0ZAEAizANmnbS0NF2+fHnM+e7ubnm93jHnGxoaVFlZqePHj4/7l999+/YpEAiosrIydM4wDAWDQZ04cUJFRUWh83v37lV1dbW8Xq8yMzOn4I4AAACsi94MABAtltkCs0xxcbHa2trGnG9raxvV2F1TWlqq0tJSHT9+XJs2bQp7zcbGRu3cuVPt7e2h47333tPq1avV1NQ0aqzf79fixYtpFgEAAERvBgCIHjPzgFlm+/bteu655/TII4/ooYceUnJysl599VW9/PLLOnz4cNj3HDlyRENDQ0pPTx/zWnt7u9ra2vTSSy9p6dKlo16rq6vTk08+qT179sjhmPy/m97eXp06dSr09enTp9Xe3i6fz6fc3NzobhQAAMAC6M0AANFiZh4wyxQUFKi1tVUffPCBqqurVVlZqf379+vAgQOqqakJ+57U1NSwzaL0x7/8lpSUjGkWJam2tlYXLlzQa6+9FlFtx44dU0VFhSoqKiRJjz76qCoqKrR79+7Ibg4AAMBi6M0AANGyGYZhmF0EAAAAAAAAgMkxMw8AAAAAAACwCMI8AAAAAAAAwCII8wAAAAAAAACLIMwDAAAAAAAALIIwDwAAAAAAALAIwjwAAAAAAADAIgjzAAAAAAAAAIsgzAMAAAAAAAAsgjAPAAAAAAAAsAjCPAAAAAAAAMAiCPMAAAAAAAAAiyDMAwAAAAAAACzifwE00MCs0PUzbwAAAABJRU5ErkJggg==",
+      "text/plain": [
+       "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "ep.pp.neighbors(edata, use_rep=\"saits_latent\")\n", + "ep.tl.umap(edata)\n", + "ep.pl.umap(edata, color=[\"gender_concept_id\", \"race_source_value\"])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ehrapy_venv_oct", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 929644a2c87bc3d15001e932472e57eab3dd5ee2 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Thu, 21 Nov 2024 09:08:32 +0100 Subject: [PATCH 31/43] add pypots to tutorial index --- docs/index.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/index.md b/docs/index.md index da01b06..7065d92 100644 --- a/docs/index.md +++ b/docs/index.md @@ -16,4 +16,5 @@ notebooks/omop_tables_tutorial notebooks/cohort_definition notebooks/study_design_example_omop_cdm notebooks/indwelling_arterial_catheters +notebooks/tutorial_time_series_with_pypots ``` From 3746c24bc600861342ae273d48a1ca35974098a9 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Thu, 21 Nov 2024 18:15:00 +0100 Subject: [PATCH 32/43] add one-hot presence encoding for a feature --- src/ehrdata/io/omop/_queries.py | 15 +++++++++++---- src/ehrdata/io/omop/omop.py | 2 +- tests/test_io/test_omop.py | 6 +++--- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/src/ehrdata/io/omop/_queries.py b/src/ehrdata/io/omop/_queries.py index 2975231..c4b24f8 100644 --- a/src/ehrdata/io/omop/_queries.py +++ b/src/ehrdata/io/omop/_queries.py @@ -77,8 +77,11 @@ def _drop_timedeltas(backend_handle: duckdb.duckdb.DuckDBPyConnection): def _generate_value_query(data_table: str, data_field_to_keep: Sequence, aggregation_strategy: str) -> str: - query = f"{', ' .join([f'CASE WHEN COUNT(*) = 0 THEN NULL ELSE {aggregation_strategy}({column}) END AS {column}' for column in data_field_to_keep])}" - return query + # is_present is 1 in all rows of the data_table; but need an aggregation operation, so use LAST + is_present_query = "LAST(is_present) as is_present, " + value_query = f"{', ' .join([f'{aggregation_strategy}({column}) AS {column}' for column in data_field_to_keep])}" + + return is_present_query + value_query def time_interval_table_query_long_format( @@ -137,10 +140,14 @@ def time_interval_table_query_long_format( SELECT person_id, data_table_concept_id, interval_step, start_date, start_date + interval_start_offset as interval_start, start_date + interval_end_offset as interval_end \ FROM long_format_backbone \ CROSS JOIN timedeltas \ + ), \ + data_table_with_presence_indicator as( \ + SELECT *, 1 as is_present \ + FROM {data_table} \ ) \ - SELECT lfi.person_id, lfi.data_table_concept_id, interval_step, interval_start, interval_end, {_generate_value_query(data_table, data_field_to_keep, AGGREGATION_STRATEGY_KEY[aggregation_strategy])} \ + SELECT lfi.person_id, lfi.data_table_concept_id, interval_step, interval_start, interval_end, {_generate_value_query("data_table_with_presence_indicator", data_field_to_keep, AGGREGATION_STRATEGY_KEY[aggregation_strategy])} \ FROM long_format_intervals as lfi \ - LEFT JOIN {data_table} ON lfi.person_id = {data_table}.person_id AND lfi.data_table_concept_id = {data_table}.{DATA_TABLE_CONCEPT_ID_TRUNK[data_table]}_concept_id AND 
{data_table}.{data_table}_{date_prefix}date BETWEEN lfi.interval_start AND lfi.interval_end \ + LEFT JOIN data_table_with_presence_indicator ON lfi.person_id = data_table_with_presence_indicator.person_id AND lfi.data_table_concept_id = data_table_with_presence_indicator.{DATA_TABLE_CONCEPT_ID_TRUNK[data_table]}_concept_id AND data_table_with_presence_indicator.{data_table}_{date_prefix}date BETWEEN lfi.interval_start AND lfi.interval_end \ GROUP BY lfi.person_id, lfi.data_table_concept_id, interval_step, interval_start, interval_end """ ).df() diff --git a/src/ehrdata/io/omop/omop.py b/src/ehrdata/io/omop/omop.py index 876b9d2..8c67c31 100644 --- a/src/ehrdata/io/omop/omop.py +++ b/src/ehrdata/io/omop/omop.py @@ -275,7 +275,7 @@ def setup_variables( data_tables The table to be used. Only a single table can be used. data_field_to_keep - The CDM Field in the data table to be kept. Can be e.g. "value_as_number" or "value_as_concept_id". + The CDM Field in the data table to be kept. Can be e.g. "value_as_number" or "value_as_concept_id". Importantly, can be "is_present" to have a one-hot encoding of the presence of the feature in a patient in an interval. start_time Starting time for values to be included. interval_length_number diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py index 41a99ba..62773e9 100644 --- a/tests/test_io/test_omop.py +++ b/tests/test_io/test_omop.py @@ -92,9 +92,9 @@ def test_setup_obs_invalid_observation_table_value(omop_connection_vanilla): @pytest.mark.parametrize( "data_tables,data_field_to_keep", [ - (["measurement"], ["value_as_number", "value_as_concept_id"]), - (["observation"], ["value_as_number", "value_as_concept_id"]), - (["specimen"], ["quantity"]), + (["measurement"], ["value_as_number", "value_as_concept_id", "is_present"]), + (["observation"], ["value_as_number", "value_as_concept_id", "is_present"]), + (["specimen"], ["quantity", "is_present"]), ], ) @pytest.mark.parametrize( From ac0c89b8130506d1de3f2f3b14a24f04d959160c Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Thu, 21 Nov 2024 19:16:29 +0100 Subject: [PATCH 33/43] add first basic interval-variable support for drug_exposure --- src/ehrdata/io/omop/_queries.py | 77 +++++++++++++++++++++++++++++++++ src/ehrdata/io/omop/omop.py | 52 ++++++++++++++-------- tests/test_io/test_omop.py | 15 ++++--- 3 files changed, 119 insertions(+), 25 deletions(-) diff --git a/src/ehrdata/io/omop/_queries.py b/src/ehrdata/io/omop/_queries.py index c4b24f8..080c8e3 100644 --- a/src/ehrdata/io/omop/_queries.py +++ b/src/ehrdata/io/omop/_queries.py @@ -155,3 +155,80 @@ def time_interval_table_query_long_format( _drop_timedeltas(backend_handle) return df + + +def time_interval_table_for_interval_tables_query_long_format( + backend_handle: duckdb.duckdb.DuckDBPyConnection, + time_defining_table: str, + data_table: str, + interval_length_number: int, + interval_length_unit: str, + num_intervals: int, + aggregation_strategy: str, + data_field_to_keep: Sequence[str] | str, + date_prefix: str = "", +) -> pd.DataFrame: + """Returns a long format DataFrame from the data_table. The following columns should be considered the indices of this long format: person_id, data_table_concept_id, interval_step. 
The other columns, except for start_date and end_date, should be considered the values.""" + if isinstance(data_field_to_keep, str): + data_field_to_keep = [data_field_to_keep] + + if date_prefix != "": + date_prefix = date_prefix + "_" + + timedeltas_dataframe = _generate_timedeltas(interval_length_number, interval_length_unit, num_intervals) + + _write_timedeltas_to_db( + backend_handle, + timedeltas_dataframe, + ) + + # multi-step query + # 1. Create person_time_defining_table, which matches the one created for obs. Needs to contain the person_id, and the start date in particular. + # 2. Create person_data_table (data_table is typically measurement), which contains the cross product of person_id and the distinct concept_id s. + # 3. Create long_format_backbone, which is the left join of person_time_defining_table and person_data_table. + # 4. Create long_format_intervals, which is the cross product of long_format_backbone and timedeltas. This table contains most notably the person_id, the concept_id, the interval start and end dates. + # 5. Create the final table, which is the join with the data_table (typically measurement); each measurement is assigned to its person_id, its concept_id, and the interval it fits into. + df = backend_handle.execute( + f""" + WITH person_time_defining_table AS ( \ + SELECT person.person_id as person_id, {START_DATE_KEY[time_defining_table]} as start_date, {END_DATE_KEY[time_defining_table]} as end_date \ + FROM person \ + JOIN {time_defining_table} ON person.person_id = {time_defining_table}.{TIME_DEFINING_TABLE_SUBJECT_KEY[time_defining_table]} \ + ), \ + person_data_table AS( \ + WITH distinct_data_table_concept_ids AS ( \ + SELECT DISTINCT {DATA_TABLE_CONCEPT_ID_TRUNK[data_table]}_concept_id + FROM {data_table} \ + ) + SELECT person.person_id, {DATA_TABLE_CONCEPT_ID_TRUNK[data_table]}_concept_id as data_table_concept_id \ + FROM person \ + CROSS JOIN distinct_data_table_concept_ids \ + ), \ + long_format_backbone as ( \ + SELECT person_time_defining_table.person_id, data_table_concept_id, start_date, end_date \ + FROM person_time_defining_table \ + LEFT JOIN person_data_table USING(person_id)\ + ), \ + long_format_intervals as ( \ + SELECT person_id, data_table_concept_id, interval_step, start_date, start_date + interval_start_offset as interval_start, start_date + interval_end_offset as interval_end \ + FROM long_format_backbone \ + CROSS JOIN timedeltas \ + ), \ + data_table_with_presence_indicator as( \ + SELECT *, 1 as is_present \ + FROM {data_table} \ + ) \ + SELECT lfi.person_id, lfi.data_table_concept_id, interval_step, interval_start, interval_end, {_generate_value_query("data_table_with_presence_indicator", data_field_to_keep, AGGREGATION_STRATEGY_KEY[aggregation_strategy])} \ + FROM long_format_intervals as lfi \ + LEFT JOIN data_table_with_presence_indicator ON lfi.person_id = data_table_with_presence_indicator.person_id \ + AND lfi.data_table_concept_id = data_table_with_presence_indicator.{DATA_TABLE_CONCEPT_ID_TRUNK[data_table]}_concept_id \ + AND (data_table_with_presence_indicator.{data_table}_start_date BETWEEN lfi.interval_start AND lfi.interval_end \ + OR data_table_with_presence_indicator.{data_table}_end_date BETWEEN lfi.interval_start AND lfi.interval_end \ + OR (data_table_with_presence_indicator.{data_table}_start_date < lfi.interval_start AND data_table_with_presence_indicator.{data_table}_end_date > lfi.interval_end)) \ + GROUP BY lfi.person_id, lfi.data_table_concept_id, interval_step, interval_start, interval_end + """ + 
).df() + + _drop_timedeltas(backend_handle) + + return df diff --git a/src/ehrdata/io/omop/omop.py b/src/ehrdata/io/omop/omop.py index 8c67c31..176f9af 100644 --- a/src/ehrdata/io/omop/omop.py +++ b/src/ehrdata/io/omop/omop.py @@ -32,6 +32,7 @@ _check_valid_variable_data_tables, ) from ehrdata.io.omop._queries import ( + time_interval_table_for_interval_tables_query_long_format, time_interval_table_query_long_format, ) from ehrdata.utils._omop_utils import get_table_catalog_dict @@ -396,7 +397,7 @@ def setup_interval_variables( *, backend_handle: duckdb.duckdb.DuckDBPyConnection, data_tables: Sequence[Literal["drug_exposure"]] | Literal["drug_exposure"], - data_field_to_keep: str | Sequence[str] | Literal["one-hot"], + data_field_to_keep: str | Sequence[str], interval_length_number: int, interval_length_unit: str, num_intervals: int, @@ -421,7 +422,7 @@ def setup_interval_variables( data_tables The table to be used. Only a single table can be used. data_field_to_keep - The CDM Field in the data table to be kept. Can be e.g. "value_as_number" or "value_as_concept_id". + The CDM Field in the data table to be kept. Can be e.g. "value_as_number" or "value_as_concept_id". Importantly, can be "is_present" to have a one-hot encoding of the presence of the feature in a patient in an interval. start_time Starting time for values to be included. interval_length_number @@ -469,24 +470,37 @@ def setup_interval_variables( return edata if keep_date == "start" or keep_date == "end": - date_prefix = keep_date - else: - raise NotImplementedError("support interval extraction coming soon") - ds = ( - time_interval_table_query_long_format( - backend_handle=backend_handle, - time_defining_table=time_defining_table, - data_table=data_tables[0], - data_field_to_keep=data_field_to_keep, - interval_length_number=interval_length_number, - interval_length_unit=interval_length_unit, - num_intervals=num_intervals, - aggregation_strategy=aggregation_strategy, - date_prefix=date_prefix, + ds = ( + time_interval_table_for_interval_tables_query_long_format( + backend_handle=backend_handle, + time_defining_table=time_defining_table, + data_table=data_tables[0], + data_field_to_keep=data_field_to_keep, + interval_length_number=interval_length_number, + interval_length_unit=interval_length_unit, + num_intervals=num_intervals, + aggregation_strategy=aggregation_strategy, + date_prefix=keep_date, + ) + .set_index(["person_id", "data_table_concept_id", "interval_step"]) + .to_xarray() + ) + elif keep_date == "interval": + ds = ( + time_interval_table_for_interval_tables_query_long_format( + backend_handle=backend_handle, + time_defining_table=time_defining_table, + data_table=data_tables[0], + data_field_to_keep=data_field_to_keep, + interval_length_number=interval_length_number, + interval_length_unit=interval_length_unit, + num_intervals=num_intervals, + aggregation_strategy=aggregation_strategy, + date_prefix=keep_date, + ) + .set_index(["person_id", "data_table_concept_id", "interval_step"]) + .to_xarray() ) - .set_index(["person_id", "data_table_concept_id", "interval_step"]) - .to_xarray() - ) var = ds["data_table_concept_id"].to_dataframe() diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py index 62773e9..706772e 100644 --- a/tests/test_io/test_omop.py +++ b/tests/test_io/test_omop.py @@ -139,23 +139,26 @@ def test_setup_variables( @pytest.mark.parametrize( "observation_table", - [ - "person_cohort", - ], # "person_observation_period", "person_visit_occurrence"], + ["person_cohort", 
"person_observation_period", "person_visit_occurrence"], ) @pytest.mark.parametrize( "data_tables,data_field_to_keep", [ - (["drug_exposure"], ["days_supply"]), # ["one-hot"] + (["drug_exposure"], ["days_supply"]), + (["drug_exposure"], ["is_present"]), + # (["condition_occurrence"], ["is_present"]), # TODO: write test file + # (["procedure_occurrence"], ["is_present"]), # TODO: write test file + # (["device_exposure"], ["is_present"]), # TODO: write test file + # (["note"], ["is_present"]), ], ) @pytest.mark.parametrize( "enrich_var_with_feature_info", - [False], # True, + [False, True], ) @pytest.mark.parametrize( "keep_date", - ["start", "end"], # "interval" + ["start", "end", "interval"], ) def test_setup_interval_variables( omop_connection_vanilla, From 0933889cd7f3dd14c72d0b2dedb040bf98b9d7be Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Thu, 21 Nov 2024 19:48:11 +0100 Subject: [PATCH 34/43] add strict tests for setup_variables --- tests/test_io/test_omop.py | 64 +++++++++++++++++++++++++++++++++++--- 1 file changed, 60 insertions(+), 4 deletions(-) diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py index 706772e..c43e57f 100644 --- a/tests/test_io/test_omop.py +++ b/tests/test_io/test_omop.py @@ -1,5 +1,6 @@ import re +import numpy as np import pytest import ehrdata as ed @@ -89,12 +90,64 @@ def test_setup_obs_invalid_observation_table_value(omop_connection_vanilla): "observation_table", ["person_cohort", "person_observation_period", "person_visit_occurrence"], ) +# test 1 field from table, and is_present encoding @pytest.mark.parametrize( - "data_tables,data_field_to_keep", + "data_tables,data_field_to_keep,target_r", [ - (["measurement"], ["value_as_number", "value_as_concept_id", "is_present"]), - (["observation"], ["value_as_number", "value_as_concept_id", "is_present"]), - (["specimen"], ["quantity", "is_present"]), + ( + ["measurement"], + ["value_as_number"], + [ + [[np.nan, np.nan, np.nan, np.nan], [18.0, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [20.0, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [22.0, np.nan, np.nan, np.nan]], + ], + ), + ( + ["measurement"], + ["is_present"], + [ + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + ], + ), + ( + ["observation"], + ["value_as_number"], + [ + [[np.nan, np.nan, np.nan, np.nan], [3, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [4, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [5, np.nan, np.nan, np.nan]], + ], + ), + ( + ["observation"], + ["is_present"], + [ + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + ], + ), + ( + ["specimen"], + ["quantity"], + [ + [[0.5, np.nan, np.nan, np.nan], [1.5, np.nan, np.nan, np.nan]], + [[0.5, np.nan, np.nan, np.nan], [1.5, np.nan, np.nan, np.nan]], + [[0.5, np.nan, np.nan, np.nan], [1.5, np.nan, np.nan, np.nan]], + ], + ), + ( + ["specimen"], + ["is_present"], + [ + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + ], + ), ], ) @pytest.mark.parametrize( @@ -112,6 +165,7 @@ def test_setup_variables( data_field_to_keep, enrich_var_with_feature_info, enrich_var_with_unit_info, + target_r, ): 
num_intervals = 4 con = omop_connection_vanilla @@ -136,6 +190,8 @@ def test_setup_variables( VAR_DIM_UNIT_INFO if enrich_var_with_unit_info else 0 ) + assert np.allclose(edata.r, np.array(target_r), equal_nan=True) + @pytest.mark.parametrize( "observation_table", From 5b387062cdb7d634633e61737c45d768f98bf78e Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Thu, 21 Nov 2024 20:00:27 +0100 Subject: [PATCH 35/43] add tight test for interval vars drug_exposure table --- src/ehrdata/io/omop/omop.py | 2 +- tests/test_io/test_omop.py | 74 +++++++++++++++++++++++++++++++++---- 2 files changed, 67 insertions(+), 9 deletions(-) diff --git a/src/ehrdata/io/omop/omop.py b/src/ehrdata/io/omop/omop.py index 176f9af..85d8de7 100644 --- a/src/ehrdata/io/omop/omop.py +++ b/src/ehrdata/io/omop/omop.py @@ -471,7 +471,7 @@ def setup_interval_variables( if keep_date == "start" or keep_date == "end": ds = ( - time_interval_table_for_interval_tables_query_long_format( + time_interval_table_query_long_format( backend_handle=backend_handle, time_defining_table=time_defining_table, data_table=data_tables[0], diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py index c43e57f..1ef28c9 100644 --- a/tests/test_io/test_omop.py +++ b/tests/test_io/test_omop.py @@ -197,11 +197,70 @@ def test_setup_variables( "observation_table", ["person_cohort", "person_observation_period", "person_visit_occurrence"], ) +# test 1 field from table, and is_present encoding, with start, end, and interval @pytest.mark.parametrize( - "data_tables,data_field_to_keep", + "data_tables,data_field_to_keep,keep_date,target_r", [ - (["drug_exposure"], ["days_supply"]), - (["drug_exposure"], ["is_present"]), + ( + ["drug_exposure"], + ["days_supply"], + "start", + [ + [[31.0, np.nan, np.nan, np.nan], [31.0, np.nan, np.nan, np.nan]], + [[31.0, np.nan, np.nan, np.nan], [31.0, np.nan, np.nan, np.nan]], + [[31.0, np.nan, np.nan, np.nan], [31.0, np.nan, np.nan, np.nan]], + ], + ), + ( + ["drug_exposure"], + ["days_supply"], + "end", + [ + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + ], + ), + ( + ["drug_exposure"], + ["days_supply"], + "interval", + [ + [[31.0, 31.0, 31.0, 31.0], [31.0, 31.0, 31.0, 31.0]], + [[31.0, 31.0, 31.0, 31.0], [31.0, 31.0, 31.0, 31.0]], + [[31.0, 31.0, 31.0, 31.0], [31.0, 31.0, 31.0, 31.0]], + ], + ), + ( + ["drug_exposure"], + ["is_present"], + "start", + [ + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + ], + ), + ( + ["drug_exposure"], + ["is_present"], + "end", + [ + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + ], + ), + ( + ["drug_exposure"], + ["is_present"], + "interval", + [ + [[1, 1, 1, 1], [1, 1, 1, 1]], + [[1, 1, 1, 1], [1, 1, 1, 1]], + [[1, 1, 1, 1], [1, 1, 1, 1]], + ], + ), # (["condition_occurrence"], ["is_present"]), # TODO: write test file # (["procedure_occurrence"], ["is_present"]), # TODO: write test file # (["device_exposure"], ["is_present"]), # TODO: write test file @@ -212,15 +271,12 @@ def test_setup_variables( "enrich_var_with_feature_info", [False, True], ) -@pytest.mark.parametrize( - "keep_date", - ["start", 
"end", "interval"], -) -def test_setup_interval_variables( +def test_setup_interval_type_variables( omop_connection_vanilla, observation_table, data_tables, data_field_to_keep, + target_r, enrich_var_with_feature_info, keep_date, ): @@ -245,6 +301,8 @@ def test_setup_interval_variables( assert edata.r.shape[2] == num_intervals assert edata.var.shape[1] == VAR_DIM_BASE + (VAR_DIM_FEATURE_INFO if enrich_var_with_feature_info else 0) + assert np.allclose(edata.r, np.array(target_r), equal_nan=True) + @pytest.mark.parametrize( "edata, backend_handle, data_tables, data_field_to_keep, interval_length_number, interval_length_unit, num_intervals, enrich_var_with_feature_info, enrich_var_with_unit_info, expected_error", From 056340e0d7ce1fdd7320c0b36a31116e05b85ad1 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Thu, 21 Nov 2024 20:44:06 +0100 Subject: [PATCH 36/43] support condition_occurrence --- src/ehrdata/io/omop/_check_arguments.py | 2 +- src/ehrdata/io/omop/_queries.py | 17 +++-- .../toy_omop/vanilla/condition_occurrence.csv | 10 +++ tests/test_io/test_omop.py | 62 ++++++++++++++++++- 4 files changed, 85 insertions(+), 6 deletions(-) create mode 100644 tests/data/toy_omop/vanilla/condition_occurrence.csv diff --git a/src/ehrdata/io/omop/_check_arguments.py b/src/ehrdata/io/omop/_check_arguments.py index 8b145cf..cc89cae 100644 --- a/src/ehrdata/io/omop/_check_arguments.py +++ b/src/ehrdata/io/omop/_check_arguments.py @@ -12,7 +12,7 @@ VALID_OBSERVATION_TABLES_SINGLE = ["person"] VALID_OBSERVATION_TABLES_JOIN = ["person_cohort", "person_observation_period", "person_visit_occurrence"] VALID_VARIABLE_TABLES = ["measurement", "observation", "specimen"] -VALID_INTERVAL_VARIABLE_TABLES = ["drug_exposure"] +VALID_INTERVAL_VARIABLE_TABLES = ["drug_exposure", "condition_occurrence"] VALID_KEEP_DATES = ["start", "end", "interval"] diff --git a/src/ehrdata/io/omop/_queries.py b/src/ehrdata/io/omop/_queries.py index 080c8e3..99bf9c0 100644 --- a/src/ehrdata/io/omop/_queries.py +++ b/src/ehrdata/io/omop/_queries.py @@ -24,7 +24,16 @@ "observation": "observation", "specimen": "specimen", "drug_exposure": "drug", + "condition_occurrence": "condition", } +DATA_TABLE_DATE_TRUNK = { + "measurement": "measurement", + "observation": "observation", + "specimen": "specimen", + "drug_exposure": "drug_exposure", + "condition_occurrence": "condition", +} + AGGREGATION_STRATEGY_KEY = { "last": "LAST", @@ -147,7 +156,7 @@ def time_interval_table_query_long_format( ) \ SELECT lfi.person_id, lfi.data_table_concept_id, interval_step, interval_start, interval_end, {_generate_value_query("data_table_with_presence_indicator", data_field_to_keep, AGGREGATION_STRATEGY_KEY[aggregation_strategy])} \ FROM long_format_intervals as lfi \ - LEFT JOIN data_table_with_presence_indicator ON lfi.person_id = data_table_with_presence_indicator.person_id AND lfi.data_table_concept_id = data_table_with_presence_indicator.{DATA_TABLE_CONCEPT_ID_TRUNK[data_table]}_concept_id AND data_table_with_presence_indicator.{data_table}_{date_prefix}date BETWEEN lfi.interval_start AND lfi.interval_end \ + LEFT JOIN data_table_with_presence_indicator ON lfi.person_id = data_table_with_presence_indicator.person_id AND lfi.data_table_concept_id = data_table_with_presence_indicator.{DATA_TABLE_CONCEPT_ID_TRUNK[data_table]}_concept_id AND data_table_with_presence_indicator.{DATA_TABLE_DATE_TRUNK[data_table]}_{date_prefix}date BETWEEN lfi.interval_start AND lfi.interval_end \ GROUP BY lfi.person_id, lfi.data_table_concept_id, interval_step, 
interval_start, interval_end """ ).df() @@ -222,9 +231,9 @@ def time_interval_table_for_interval_tables_query_long_format( FROM long_format_intervals as lfi \ LEFT JOIN data_table_with_presence_indicator ON lfi.person_id = data_table_with_presence_indicator.person_id \ AND lfi.data_table_concept_id = data_table_with_presence_indicator.{DATA_TABLE_CONCEPT_ID_TRUNK[data_table]}_concept_id \ - AND (data_table_with_presence_indicator.{data_table}_start_date BETWEEN lfi.interval_start AND lfi.interval_end \ - OR data_table_with_presence_indicator.{data_table}_end_date BETWEEN lfi.interval_start AND lfi.interval_end \ - OR (data_table_with_presence_indicator.{data_table}_start_date < lfi.interval_start AND data_table_with_presence_indicator.{data_table}_end_date > lfi.interval_end)) \ + AND (data_table_with_presence_indicator.{DATA_TABLE_DATE_TRUNK[data_table]}_start_date BETWEEN lfi.interval_start AND lfi.interval_end \ + OR data_table_with_presence_indicator.{DATA_TABLE_DATE_TRUNK[data_table]}_end_date BETWEEN lfi.interval_start AND lfi.interval_end \ + OR (data_table_with_presence_indicator.{DATA_TABLE_DATE_TRUNK[data_table]}_start_date < lfi.interval_start AND data_table_with_presence_indicator.{DATA_TABLE_DATE_TRUNK[data_table]}_end_date > lfi.interval_end)) \ GROUP BY lfi.person_id, lfi.data_table_concept_id, interval_step, interval_start, interval_end """ ).df() diff --git a/tests/data/toy_omop/vanilla/condition_occurrence.csv b/tests/data/toy_omop/vanilla/condition_occurrence.csv new file mode 100644 index 0000000..0efb7e0 --- /dev/null +++ b/tests/data/toy_omop/vanilla/condition_occurrence.csv @@ -0,0 +1,10 @@ +condition_occurrence_id,person_id,condition_concept_id,condition_start_date,condition_start_datetime,condition_end_date,condition_end_datetime,condition_type_concept_id,condition_status_concept_id,stop_reason,provider_id,visit_occurrence_id,visit_detail_id,condition_source_value,condition_source_concept_id,condition_status_source_value +1,1,43530622,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,38000175,0,,0,31,,10,1121000119107, +2,1,43530622,2100-02-01,2100-02-01 12:00:00,2100-02-28,2100-02-28 00:00:00,38000175,0,,0,28,,10,1121000119107, +3,1,4112343,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,38000175,0,,0,31,,15,4112343, +4,2,43530622,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,38000175,0,,0,31,,10,1121000119107, +5,2,43530622,2100-02-01,2100-02-01 12:00:00,2100-02-28,2100-02-28 00:00:00,38000175,0,,0,28,,10,1121000119107, +6,2,4112343,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,38000175,0,,0,31,,15,4112343, +7,3,43530622,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,38000175,0,,0,31,,10,1121000119107, +8,3,43530622,2100-02-01,2100-02-01 12:00:00,2100-02-28,2100-02-28 00:00:00,38000175,0,,0,28,,10,1121000119107, +9,3,4112343,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,38000175,0,,0,31,,15,4112343, diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py index 1ef28c9..6bc101d 100644 --- a/tests/test_io/test_omop.py +++ b/tests/test_io/test_omop.py @@ -16,6 +16,7 @@ "observation": 2, "specimen": 2, "drug_exposure": 2, + "condition_occurrence": 2, } # constants for setup_variables @@ -261,7 +262,66 @@ def test_setup_variables( [[1, 1, 1, 1], [1, 1, 1, 1]], ], ), - # (["condition_occurrence"], ["is_present"]), # TODO: write test file + ( + ["condition_occurrence"], + ["condition_source_value"], + "start", + [ + [[15, np.nan, np.nan, np.nan], [10, np.nan, np.nan, 
np.nan]], + [[15, np.nan, np.nan, np.nan], [10, np.nan, np.nan, np.nan]], + [[15, np.nan, np.nan, np.nan], [10, np.nan, np.nan, np.nan]], + ], + ), + ( + ["condition_occurrence"], + ["condition_source_value"], + "end", + [ + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + ], + ), + ( + ["condition_occurrence"], + ["condition_source_value"], + "interval", + [ + [[15, 15, 15, 15], [10, 10, 10, 10]], + [[15, 15, 15, 15], [10, 10, 10, 10]], + [[15, 15, 15, 15], [10, 10, 10, 10]], + ], + ), + ( + ["condition_occurrence"], + ["is_present"], + "start", + [ + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + ], + ), + ( + ["condition_occurrence"], + ["is_present"], + "end", + [ + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + ], + ), + ( + ["condition_occurrence"], + ["is_present"], + "interval", + [ + [[1, 1, 1, 1], [1, 1, 1, 1]], + [[1, 1, 1, 1], [1, 1, 1, 1]], + [[1, 1, 1, 1], [1, 1, 1, 1]], + ], + ), # (["procedure_occurrence"], ["is_present"]), # TODO: write test file # (["device_exposure"], ["is_present"]), # TODO: write test file # (["note"], ["is_present"]), From 85b0c1750e44deff2435d27065220c9fcfc86232 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Fri, 22 Nov 2024 12:04:25 +0100 Subject: [PATCH 37/43] support procedure_occurrence --- src/ehrdata/io/omop/_check_arguments.py | 2 +- src/ehrdata/io/omop/_queries.py | 81 +++++++++++++------ .../toy_omop/vanilla/procedure_occurrence.csv | 10 +++ tests/test_io/test_omop.py | 62 +++++++++++++- 4 files changed, 128 insertions(+), 27 deletions(-) create mode 100644 tests/data/toy_omop/vanilla/procedure_occurrence.csv diff --git a/src/ehrdata/io/omop/_check_arguments.py b/src/ehrdata/io/omop/_check_arguments.py index cc89cae..fcd2d78 100644 --- a/src/ehrdata/io/omop/_check_arguments.py +++ b/src/ehrdata/io/omop/_check_arguments.py @@ -12,7 +12,7 @@ VALID_OBSERVATION_TABLES_SINGLE = ["person"] VALID_OBSERVATION_TABLES_JOIN = ["person_cohort", "person_observation_period", "person_visit_occurrence"] VALID_VARIABLE_TABLES = ["measurement", "observation", "specimen"] -VALID_INTERVAL_VARIABLE_TABLES = ["drug_exposure", "condition_occurrence"] +VALID_INTERVAL_VARIABLE_TABLES = ["drug_exposure", "condition_occurrence", "procedure_occurrence"] VALID_KEEP_DATES = ["start", "end", "interval"] diff --git a/src/ehrdata/io/omop/_queries.py b/src/ehrdata/io/omop/_queries.py index 99bf9c0..87eab08 100644 --- a/src/ehrdata/io/omop/_queries.py +++ b/src/ehrdata/io/omop/_queries.py @@ -3,16 +3,22 @@ import duckdb import pandas as pd -START_DATE_KEY = { - "visit_occurrence": "visit_start_date", - "observation_period": "observation_period_start_date", - "cohort": "cohort_start_date", -} -END_DATE_KEY = { - "visit_occurrence": "visit_end_date", - "observation_period": "observation_period_end_date", - "cohort": "cohort_end_date", -} +# START_DATE_KEY = { +# "visit_occurrence": "visit_start_date", +# "observation_period": "observation_period_start_date", +# "cohort": "cohort_start_date", +# "drug_exposure": "drug_exposure_start_date", +# "condition_occurrence": "condition_start_date", +# 
"procedure_occurrence": "procedure_date", # v5.4, as v5.3 had no end! TODO: allow v5.3 too +# } +# END_DATE_KEY = { +# "visit_occurrence": "visit_end_date", +# "observation_period": "observation_period_end_date", +# "cohort": "cohort_end_date", +# "drug_exposure": "drug_exposure_end_date", +# "condition_occurrence": "condition_end_date", +# "procedure_occurrence": "procedure_end_date", # v5.4, as v5.3 had no end! TODO: allow v5.3 too +# } TIME_DEFINING_TABLE_SUBJECT_KEY = { "visit_occurrence": "person_id", "observation_period": "person_id", @@ -25,15 +31,40 @@ "specimen": "specimen", "drug_exposure": "drug", "condition_occurrence": "condition", + "procedure_occurrence": "procedure", } -DATA_TABLE_DATE_TRUNK = { - "measurement": "measurement", - "observation": "observation", - "specimen": "specimen", - "drug_exposure": "drug_exposure", - "condition_occurrence": "condition", -} +# DATA_TABLE_DATE_TRUNK = { +# "measurement": "measurement", +# "observation": "observation", +# "specimen": "specimen", +# "drug_exposure": "drug_exposure", +# "condition_occurrence": "condition", +# "procedure_occurrence": "procedure", +# } +DATA_TABLE_DATE_KEYS = { + "timepoint": { + "measurement": "measurement_date", + "observation": "observation_date", + "specimen": "specimen_date", + }, + "start": { + "visit_occurrence": "visit_start_date", + "observation_period": "observation_period_start_date", + "cohort": "cohort_start_date", + "drug_exposure": "drug_exposure_start_date", + "condition_occurrence": "condition_start_date", + "procedure_occurrence": "procedure_date", # in v5.3, procedure didnt have end date + }, + "end": { + "visit_occurrence": "visit_end_date", + "observation_period": "observation_period_end_date", + "cohort": "cohort_end_date", + "drug_exposure": "drug_exposure_end_date", + "condition_occurrence": "condition_end_date", + "procedure_occurrence": "procedure_end_date", # in v5.3, procedure didnt have end date TODO v5.3 support + }, +} AGGREGATION_STRATEGY_KEY = { "last": "LAST", @@ -108,8 +139,8 @@ def time_interval_table_query_long_format( if isinstance(data_field_to_keep, str): data_field_to_keep = [data_field_to_keep] - if date_prefix != "": - date_prefix = date_prefix + "_" + if date_prefix == "": + date_prefix = "timepoint" timedeltas_dataframe = _generate_timedeltas(interval_length_number, interval_length_unit, num_intervals) @@ -127,7 +158,7 @@ def time_interval_table_query_long_format( df = backend_handle.execute( f""" WITH person_time_defining_table AS ( \ - SELECT person.person_id as person_id, {START_DATE_KEY[time_defining_table]} as start_date, {END_DATE_KEY[time_defining_table]} as end_date \ + SELECT person.person_id as person_id, {DATA_TABLE_DATE_KEYS["start"][time_defining_table]} as start_date, {DATA_TABLE_DATE_KEYS["end"][time_defining_table]} as end_date \ FROM person \ JOIN {time_defining_table} ON person.person_id = {time_defining_table}.{TIME_DEFINING_TABLE_SUBJECT_KEY[time_defining_table]} \ ), \ @@ -156,7 +187,7 @@ def time_interval_table_query_long_format( ) \ SELECT lfi.person_id, lfi.data_table_concept_id, interval_step, interval_start, interval_end, {_generate_value_query("data_table_with_presence_indicator", data_field_to_keep, AGGREGATION_STRATEGY_KEY[aggregation_strategy])} \ FROM long_format_intervals as lfi \ - LEFT JOIN data_table_with_presence_indicator ON lfi.person_id = data_table_with_presence_indicator.person_id AND lfi.data_table_concept_id = data_table_with_presence_indicator.{DATA_TABLE_CONCEPT_ID_TRUNK[data_table]}_concept_id AND 
data_table_with_presence_indicator.{DATA_TABLE_DATE_TRUNK[data_table]}_{date_prefix}date BETWEEN lfi.interval_start AND lfi.interval_end \ + LEFT JOIN data_table_with_presence_indicator ON lfi.person_id = data_table_with_presence_indicator.person_id AND lfi.data_table_concept_id = data_table_with_presence_indicator.{DATA_TABLE_CONCEPT_ID_TRUNK[data_table]}_concept_id AND data_table_with_presence_indicator.{DATA_TABLE_DATE_KEYS[date_prefix][data_table]} BETWEEN lfi.interval_start AND lfi.interval_end \ GROUP BY lfi.person_id, lfi.data_table_concept_id, interval_step, interval_start, interval_end """ ).df() @@ -200,7 +231,7 @@ def time_interval_table_for_interval_tables_query_long_format( df = backend_handle.execute( f""" WITH person_time_defining_table AS ( \ - SELECT person.person_id as person_id, {START_DATE_KEY[time_defining_table]} as start_date, {END_DATE_KEY[time_defining_table]} as end_date \ + SELECT person.person_id as person_id, {DATA_TABLE_DATE_KEYS["start"][time_defining_table]} as start_date, {DATA_TABLE_DATE_KEYS["end"][time_defining_table]} as end_date \ FROM person \ JOIN {time_defining_table} ON person.person_id = {time_defining_table}.{TIME_DEFINING_TABLE_SUBJECT_KEY[time_defining_table]} \ ), \ @@ -231,9 +262,9 @@ def time_interval_table_for_interval_tables_query_long_format( FROM long_format_intervals as lfi \ LEFT JOIN data_table_with_presence_indicator ON lfi.person_id = data_table_with_presence_indicator.person_id \ AND lfi.data_table_concept_id = data_table_with_presence_indicator.{DATA_TABLE_CONCEPT_ID_TRUNK[data_table]}_concept_id \ - AND (data_table_with_presence_indicator.{DATA_TABLE_DATE_TRUNK[data_table]}_start_date BETWEEN lfi.interval_start AND lfi.interval_end \ - OR data_table_with_presence_indicator.{DATA_TABLE_DATE_TRUNK[data_table]}_end_date BETWEEN lfi.interval_start AND lfi.interval_end \ - OR (data_table_with_presence_indicator.{DATA_TABLE_DATE_TRUNK[data_table]}_start_date < lfi.interval_start AND data_table_with_presence_indicator.{DATA_TABLE_DATE_TRUNK[data_table]}_end_date > lfi.interval_end)) \ + AND (data_table_with_presence_indicator.{DATA_TABLE_DATE_KEYS["start"][data_table]} BETWEEN lfi.interval_start AND lfi.interval_end \ + OR data_table_with_presence_indicator.{DATA_TABLE_DATE_KEYS["end"][data_table]} BETWEEN lfi.interval_start AND lfi.interval_end \ + OR (data_table_with_presence_indicator.{DATA_TABLE_DATE_KEYS["start"][data_table]} < lfi.interval_start AND data_table_with_presence_indicator.{DATA_TABLE_DATE_KEYS["end"][data_table]} > lfi.interval_end)) \ GROUP BY lfi.person_id, lfi.data_table_concept_id, interval_step, interval_start, interval_end """ ).df() diff --git a/tests/data/toy_omop/vanilla/procedure_occurrence.csv b/tests/data/toy_omop/vanilla/procedure_occurrence.csv new file mode 100644 index 0000000..90fe7b1 --- /dev/null +++ b/tests/data/toy_omop/vanilla/procedure_occurrence.csv @@ -0,0 +1,10 @@ +procedure_occurrence_id,person_id,procedure_concept_id,procedure_date,procedure_datetime,procedure_end_date,procedure_end_datetime,procedure_type_concept_id,modifier_concept_id,quantity,provider_id,visit_occurrence_id,visit_detail_id,procedure_source_value,procedure_source_concept_id,modifier_source_value +1,1,4326177,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,38000267,0,,0,31,,430193006,1121000119107, +2,1,4326177,2100-02-01,2100-02-01 12:00:00,2100-02-28,2100-02-28 00:00:00,38000267,0,,0,28,,430193006,1121000119107, +3,1,4107731,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 
00:00:00,38000267,0,,0,31,,180256009,4107731, +4,2,4326177,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,38000267,0,,0,31,,430193006,1121000119107, +5,2,4326177,2100-02-01,2100-02-01 12:00:00,2100-02-28,2100-02-28 00:00:00,38000267,0,,0,28,,430193006,1121000119107, +6,2,4107731,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,38000267,0,,0,31,,180256009,4107731, +7,3,4326177,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,38000267,0,,0,31,,430193006,1121000119107, +8,3,4326177,2100-02-01,2100-02-01 12:00:00,2100-02-28,2100-02-28 00:00:00,38000267,0,,0,28,,430193006,1121000119107, +9,3,4107731,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,38000267,0,,0,31,,180256009,4107731, diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py index 6bc101d..e8d503c 100644 --- a/tests/test_io/test_omop.py +++ b/tests/test_io/test_omop.py @@ -17,6 +17,7 @@ "specimen": 2, "drug_exposure": 2, "condition_occurrence": 2, + "procedure_occurrence": 2, } # constants for setup_variables @@ -322,7 +323,66 @@ def test_setup_variables( [[1, 1, 1, 1], [1, 1, 1, 1]], ], ), - # (["procedure_occurrence"], ["is_present"]), # TODO: write test file + ( + ["procedure_occurrence"], + ["procedure_source_value"], + "start", + [ + [[180256009, np.nan, np.nan, np.nan], [430193006, np.nan, np.nan, np.nan]], + [[180256009, np.nan, np.nan, np.nan], [430193006, np.nan, np.nan, np.nan]], + [[180256009, np.nan, np.nan, np.nan], [430193006, np.nan, np.nan, np.nan]], + ], + ), + ( + ["procedure_occurrence"], + ["procedure_source_value"], + "end", + [ + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + ], + ), + ( + ["procedure_occurrence"], + ["procedure_source_value"], + "interval", + [ + [[180256009, 180256009, 180256009, 180256009], [430193006, 430193006, 430193006, 430193006]], + [[180256009, 180256009, 180256009, 180256009], [430193006, 430193006, 430193006, 430193006]], + [[180256009, 180256009, 180256009, 180256009], [430193006, 430193006, 430193006, 430193006]], + ], + ), + ( + ["procedure_occurrence"], + ["is_present"], + "start", + [ + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + ], + ), + ( + ["procedure_occurrence"], + ["is_present"], + "end", + [ + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + ], + ), + ( + ["procedure_occurrence"], + ["is_present"], + "interval", + [ + [[1, 1, 1, 1], [1, 1, 1, 1]], + [[1, 1, 1, 1], [1, 1, 1, 1]], + [[1, 1, 1, 1], [1, 1, 1, 1]], + ], + ), # (["device_exposure"], ["is_present"]), # TODO: write test file # (["note"], ["is_present"]), ], From 6b0e57ebe5c5b5cadd08f5108bdc816257f70cbe Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Fri, 22 Nov 2024 14:05:47 +0100 Subject: [PATCH 38/43] support device_exposure --- src/ehrdata/io/omop/_check_arguments.py | 7 ++- src/ehrdata/io/omop/_queries.py | 3 + .../data/toy_omop/vanilla/device_exposure.csv | 10 +++ tests/test_io/test_omop.py | 62 ++++++++++++++++++- 4 files changed, 80 insertions(+), 2 deletions(-) create mode 100644 tests/data/toy_omop/vanilla/device_exposure.csv diff --git 
a/src/ehrdata/io/omop/_check_arguments.py b/src/ehrdata/io/omop/_check_arguments.py index fcd2d78..5da1737 100644 --- a/src/ehrdata/io/omop/_check_arguments.py +++ b/src/ehrdata/io/omop/_check_arguments.py @@ -12,7 +12,12 @@ VALID_OBSERVATION_TABLES_SINGLE = ["person"] VALID_OBSERVATION_TABLES_JOIN = ["person_cohort", "person_observation_period", "person_visit_occurrence"] VALID_VARIABLE_TABLES = ["measurement", "observation", "specimen"] -VALID_INTERVAL_VARIABLE_TABLES = ["drug_exposure", "condition_occurrence", "procedure_occurrence"] +VALID_INTERVAL_VARIABLE_TABLES = [ + "drug_exposure", + "condition_occurrence", + "procedure_occurrence", + "device_exposure", +] VALID_KEEP_DATES = ["start", "end", "interval"] diff --git a/src/ehrdata/io/omop/_queries.py b/src/ehrdata/io/omop/_queries.py index 87eab08..3e15f47 100644 --- a/src/ehrdata/io/omop/_queries.py +++ b/src/ehrdata/io/omop/_queries.py @@ -32,6 +32,7 @@ "drug_exposure": "drug", "condition_occurrence": "condition", "procedure_occurrence": "procedure", + "device_exposure": "device", } # DATA_TABLE_DATE_TRUNK = { # "measurement": "measurement", @@ -55,6 +56,7 @@ "drug_exposure": "drug_exposure_start_date", "condition_occurrence": "condition_start_date", "procedure_occurrence": "procedure_date", # in v5.3, procedure didnt have end date + "device_exposure": "device_exposure_start_date", }, "end": { "visit_occurrence": "visit_end_date", @@ -63,6 +65,7 @@ "drug_exposure": "drug_exposure_end_date", "condition_occurrence": "condition_end_date", "procedure_occurrence": "procedure_end_date", # in v5.3, procedure didnt have end date TODO v5.3 support + "device_exposure": "device_exposure_end_date", }, } diff --git a/tests/data/toy_omop/vanilla/device_exposure.csv b/tests/data/toy_omop/vanilla/device_exposure.csv new file mode 100644 index 0000000..d84c862 --- /dev/null +++ b/tests/data/toy_omop/vanilla/device_exposure.csv @@ -0,0 +1,10 @@ +device_exposure_id,person_id,device_concept_id,device_exposure_start_date,device_exposure_start_datetime,device_exposure_end_date,device_exposure_end_datetime,device_type_concept_id,unique_device_id,production_id,quantity,provider_id,visit_occurrence_id,visit_detail_id,device_source_value,device_source_concept_id,unit_concept_id,unit_source_value,unit_source_concept_id +1,1,4217646,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,38000267,,,,,,,72506001,4217646,,, +2,1,4217646,2100-02-01,2100-02-01 12:00:00,2100-02-28,2100-02-28 00:00:00,38000267,,,,,,,72506001,4217646,,, +3,1,45768171,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,32817,,,,,,,224087,2000030021,,, +4,2,4217646,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,38000267,,,,,,,72506001,4217646,,, +5,2,4217646,2100-02-01,2100-02-01 12:00:00,2100-02-28,2100-02-28 00:00:00,38000267,,,,,,,72506001,4217646,,, +6,2,45768171,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,32817,,,,,,,224087,2000030021,,, +7,3,4217646,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,38000267,,,,,,,72506001,4217646,,, +8,3,4217646,2100-02-01,2100-02-01 12:00:00,2100-02-28,2100-02-28 00:00:00,38000267,,,,,,,72506001,4217646,,, +9,3,45768171,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,32817,,,,,,,224087,2000030021,,, diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py index e8d503c..911e967 100644 --- a/tests/test_io/test_omop.py +++ b/tests/test_io/test_omop.py @@ -18,6 +18,7 @@ "drug_exposure": 2, "condition_occurrence": 2, "procedure_occurrence": 2, + 
"device_exposure": 2, } # constants for setup_variables @@ -383,7 +384,66 @@ def test_setup_variables( [[1, 1, 1, 1], [1, 1, 1, 1]], ], ), - # (["device_exposure"], ["is_present"]), # TODO: write test file + ( + ["device_exposure"], + ["device_source_value"], + "start", + [ + [[72506001, np.nan, np.nan, np.nan], [224087, np.nan, np.nan, np.nan]], + [[72506001, np.nan, np.nan, np.nan], [224087, np.nan, np.nan, np.nan]], + [[72506001, np.nan, np.nan, np.nan], [224087, np.nan, np.nan, np.nan]], + ], + ), + ( + ["device_exposure"], + ["device_source_value"], + "end", + [ + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + ], + ), + ( + ["device_exposure"], + ["device_source_value"], + "interval", + [ + [[72506001, 72506001, 72506001, 72506001], [224087, 224087, 224087, 224087]], + [[72506001, 72506001, 72506001, 72506001], [224087, 224087, 224087, 224087]], + [[72506001, 72506001, 72506001, 72506001], [224087, 224087, 224087, 224087]], + ], + ), + ( + ["device_exposure"], + ["is_present"], + "start", + [ + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + ], + ), + ( + ["device_exposure"], + ["is_present"], + "end", + [ + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + ], + ), + ( + ["device_exposure"], + ["is_present"], + "interval", + [ + [[1, 1, 1, 1], [1, 1, 1, 1]], + [[1, 1, 1, 1], [1, 1, 1, 1]], + [[1, 1, 1, 1], [1, 1, 1, 1]], + ], + ), # (["note"], ["is_present"]), ], ) From 9dc6637fff6eeaafc0f98831195c73192338addd Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Fri, 22 Nov 2024 14:36:37 +0100 Subject: [PATCH 39/43] support drug_era --- src/ehrdata/io/omop/_check_arguments.py | 1 + src/ehrdata/io/omop/_queries.py | 3 ++ tests/data/toy_omop/vanilla/drug_era.csv | 10 ++++ tests/test_io/test_omop.py | 62 +++++++++++++++++++++++- 4 files changed, 75 insertions(+), 1 deletion(-) create mode 100644 tests/data/toy_omop/vanilla/drug_era.csv diff --git a/src/ehrdata/io/omop/_check_arguments.py b/src/ehrdata/io/omop/_check_arguments.py index 5da1737..b53e79f 100644 --- a/src/ehrdata/io/omop/_check_arguments.py +++ b/src/ehrdata/io/omop/_check_arguments.py @@ -17,6 +17,7 @@ "condition_occurrence", "procedure_occurrence", "device_exposure", + "drug_era", ] VALID_KEEP_DATES = ["start", "end", "interval"] diff --git a/src/ehrdata/io/omop/_queries.py b/src/ehrdata/io/omop/_queries.py index 3e15f47..4684633 100644 --- a/src/ehrdata/io/omop/_queries.py +++ b/src/ehrdata/io/omop/_queries.py @@ -33,6 +33,7 @@ "condition_occurrence": "condition", "procedure_occurrence": "procedure", "device_exposure": "device", + "drug_era": "drug", } # DATA_TABLE_DATE_TRUNK = { # "measurement": "measurement", @@ -57,6 +58,7 @@ "condition_occurrence": "condition_start_date", "procedure_occurrence": "procedure_date", # in v5.3, procedure didnt have end date "device_exposure": "device_exposure_start_date", + "drug_era": "drug_era_start_date", }, "end": { "visit_occurrence": "visit_end_date", @@ -66,6 +68,7 @@ "condition_occurrence": "condition_end_date", "procedure_occurrence": "procedure_end_date", # in v5.3, procedure didnt have end date TODO v5.3 support 
"device_exposure": "device_exposure_end_date", + "drug_era": "drug_era_end_date", }, } diff --git a/tests/data/toy_omop/vanilla/drug_era.csv b/tests/data/toy_omop/vanilla/drug_era.csv new file mode 100644 index 0000000..a2dbf2c --- /dev/null +++ b/tests/data/toy_omop/vanilla/drug_era.csv @@ -0,0 +1,10 @@ +drug_era_id,person_id,drug_concept_id,drug_era_start_date,drug_era_end_date,drug_exposure_count,gap_days +1,1,1124957,2100-01-01,2100-01-31,2,1 +2,1,1124957,2100-02-01,2100-02-28,2,1 +3,1,1368671,2100-01-01,2100-01-31,4,3 +4,2,1124957,2100-01-01,2100-01-31,2,1 +5,2,1124957,2100-02-01,2100-02-28,2,1 +6,2,1368671,2100-01-01,2100-01-31,4,3 +7,3,1124957,2100-01-01,2100-01-31,2,2 +8,3,1124957,2100-02-01,2100-02-28,2,1 +9,3,1368671,2100-01-01,2100-01-31,4,3 diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py index 911e967..f8ea0af 100644 --- a/tests/test_io/test_omop.py +++ b/tests/test_io/test_omop.py @@ -19,6 +19,7 @@ "condition_occurrence": 2, "procedure_occurrence": 2, "device_exposure": 2, + "drug_era": 2, } # constants for setup_variables @@ -444,7 +445,66 @@ def test_setup_variables( [[1, 1, 1, 1], [1, 1, 1, 1]], ], ), - # (["note"], ["is_present"]), + ( + ["drug_era"], + ["drug_exposure_count"], + "start", + [ + [[2, np.nan, np.nan, np.nan], [4, np.nan, np.nan, np.nan]], + [[2, np.nan, np.nan, np.nan], [4, np.nan, np.nan, np.nan]], + [[2, np.nan, np.nan, np.nan], [4, np.nan, np.nan, np.nan]], + ], + ), + ( + ["drug_era"], + ["drug_exposure_count"], + "end", + [ + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + ], + ), + ( + ["drug_era"], + ["drug_exposure_count"], + "interval", + [ + [[2, 2, 2, 2], [4, 4, 4, 4]], + [[2, 2, 2, 2], [4, 4, 4, 4]], + [[2, 2, 2, 2], [4, 4, 4, 4]], + ], + ), + ( + ["drug_era"], + ["is_present"], + "start", + [ + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + ], + ), + ( + ["drug_era"], + ["is_present"], + "end", + [ + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + ], + ), + ( + ["drug_era"], + ["is_present"], + "interval", + [ + [[1, 1, 1, 1], [1, 1, 1, 1]], + [[1, 1, 1, 1], [1, 1, 1, 1]], + [[1, 1, 1, 1], [1, 1, 1, 1]], + ], + ), ], ) @pytest.mark.parametrize( From 0525855ba8de02acdb32a58e97730269c571af46 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Fri, 22 Nov 2024 15:06:27 +0100 Subject: [PATCH 40/43] support dose_era --- src/ehrdata/io/omop/_check_arguments.py | 1 + src/ehrdata/io/omop/_queries.py | 27 ++--------- tests/data/toy_omop/vanilla/dose_era.csv | 10 ++++ tests/test_io/test_omop.py | 61 ++++++++++++++++++++++++ 4 files changed, 75 insertions(+), 24 deletions(-) create mode 100644 tests/data/toy_omop/vanilla/dose_era.csv diff --git a/src/ehrdata/io/omop/_check_arguments.py b/src/ehrdata/io/omop/_check_arguments.py index b53e79f..ad637d6 100644 --- a/src/ehrdata/io/omop/_check_arguments.py +++ b/src/ehrdata/io/omop/_check_arguments.py @@ -18,6 +18,7 @@ "procedure_occurrence", "device_exposure", "drug_era", + "dose_era", ] VALID_KEEP_DATES = ["start", "end", "interval"] diff --git a/src/ehrdata/io/omop/_queries.py b/src/ehrdata/io/omop/_queries.py index 
4684633..3a4d002 100644 --- a/src/ehrdata/io/omop/_queries.py +++ b/src/ehrdata/io/omop/_queries.py @@ -3,22 +3,6 @@ import duckdb import pandas as pd -# START_DATE_KEY = { -# "visit_occurrence": "visit_start_date", -# "observation_period": "observation_period_start_date", -# "cohort": "cohort_start_date", -# "drug_exposure": "drug_exposure_start_date", -# "condition_occurrence": "condition_start_date", -# "procedure_occurrence": "procedure_date", # v5.4, as v5.3 had no end! TODO: allow v5.3 too -# } -# END_DATE_KEY = { -# "visit_occurrence": "visit_end_date", -# "observation_period": "observation_period_end_date", -# "cohort": "cohort_end_date", -# "drug_exposure": "drug_exposure_end_date", -# "condition_occurrence": "condition_end_date", -# "procedure_occurrence": "procedure_end_date", # v5.4, as v5.3 had no end! TODO: allow v5.3 too -# } TIME_DEFINING_TABLE_SUBJECT_KEY = { "visit_occurrence": "person_id", "observation_period": "person_id", @@ -34,15 +18,8 @@ "procedure_occurrence": "procedure", "device_exposure": "device", "drug_era": "drug", + "dose_era": "drug", } -# DATA_TABLE_DATE_TRUNK = { -# "measurement": "measurement", -# "observation": "observation", -# "specimen": "specimen", -# "drug_exposure": "drug_exposure", -# "condition_occurrence": "condition", -# "procedure_occurrence": "procedure", -# } DATA_TABLE_DATE_KEYS = { "timepoint": { @@ -59,6 +36,7 @@ "procedure_occurrence": "procedure_date", # in v5.3, procedure didnt have end date "device_exposure": "device_exposure_start_date", "drug_era": "drug_era_start_date", + "dose_era": "dose_era_start_date", }, "end": { "visit_occurrence": "visit_end_date", @@ -69,6 +47,7 @@ "procedure_occurrence": "procedure_end_date", # in v5.3, procedure didnt have end date TODO v5.3 support "device_exposure": "device_exposure_end_date", "drug_era": "drug_era_end_date", + "dose_era": "dose_era_end_date", }, } diff --git a/tests/data/toy_omop/vanilla/dose_era.csv b/tests/data/toy_omop/vanilla/dose_era.csv new file mode 100644 index 0000000..b2c1bf3 --- /dev/null +++ b/tests/data/toy_omop/vanilla/dose_era.csv @@ -0,0 +1,10 @@ +dose_era_id,person_id,drug_concept_id,unit_concept_id,dose_value,dose_era_start_date,dose_era_end_date +1,1,902427,8576,10,2100-01-01,2100-01-31 +2,1,902427,8576,10,2100-02-01,2100-02-28 +3,1,714785,8576,2.5,2100-01-01,2100-01-31 +4,2,902427,8576,10,2100-01-01,2100-01-31 +5,2,902427,8576,10,2100-02-01,2100-02-28 +6,2,714785,8576,2.5,2100-01-01,2100-01-31 +7,3,902427,8576,10,2100-01-01,2100-01-31 +8,3,902427,8576,10,2100-02-01,2100-02-28 +9,3,714785,8576,2.5,2100-01-01,2100-01-31 diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py index f8ea0af..2a2c205 100644 --- a/tests/test_io/test_omop.py +++ b/tests/test_io/test_omop.py @@ -20,6 +20,7 @@ "procedure_occurrence": 2, "device_exposure": 2, "drug_era": 2, + "dose_era": 2, } # constants for setup_variables @@ -505,6 +506,66 @@ def test_setup_variables( [[1, 1, 1, 1], [1, 1, 1, 1]], ], ), + ( + ["dose_era"], + ["dose_value"], + "start", + [ + [[2.5, np.nan, np.nan, np.nan], [10, np.nan, np.nan, np.nan]], + [[2.5, np.nan, np.nan, np.nan], [10, np.nan, np.nan, np.nan]], + [[2.5, np.nan, np.nan, np.nan], [10, np.nan, np.nan, np.nan]], + ], + ), + ( + ["dose_era"], + ["dose_value"], + "end", + [ + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + ], + ), + ( + ["dose_era"], + ["dose_value"], + 
"interval", + [ + [[2.5, 2.5, 2.5, 2.5], [10, 10, 10, 10]], + [[2.5, 2.5, 2.5, 2.5], [10, 10, 10, 10]], + [[2.5, 2.5, 2.5, 2.5], [10, 10, 10, 10]], + ], + ), + ( + ["dose_era"], + ["is_present"], + "start", + [ + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + ], + ), + ( + ["dose_era"], + ["is_present"], + "end", + [ + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + ], + ), + ( + ["dose_era"], + ["is_present"], + "interval", + [ + [[1, 1, 1, 1], [1, 1, 1, 1]], + [[1, 1, 1, 1], [1, 1, 1, 1]], + [[1, 1, 1, 1], [1, 1, 1, 1]], + ], + ), ], ) @pytest.mark.parametrize( From e4ad1dbb6517dd3a02fd5b306ec88a9a115e44df Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Fri, 22 Nov 2024 15:23:20 +0100 Subject: [PATCH 41/43] support condition_era --- src/ehrdata/io/omop/_check_arguments.py | 1 + src/ehrdata/io/omop/_queries.py | 3 + tests/data/toy_omop/vanilla/condition_era.csv | 10 +++ tests/test_io/test_omop.py | 61 +++++++++++++++++++ 4 files changed, 75 insertions(+) create mode 100644 tests/data/toy_omop/vanilla/condition_era.csv diff --git a/src/ehrdata/io/omop/_check_arguments.py b/src/ehrdata/io/omop/_check_arguments.py index ad637d6..5eb9528 100644 --- a/src/ehrdata/io/omop/_check_arguments.py +++ b/src/ehrdata/io/omop/_check_arguments.py @@ -19,6 +19,7 @@ "device_exposure", "drug_era", "dose_era", + "condition_era", ] VALID_KEEP_DATES = ["start", "end", "interval"] diff --git a/src/ehrdata/io/omop/_queries.py b/src/ehrdata/io/omop/_queries.py index 3a4d002..0885726 100644 --- a/src/ehrdata/io/omop/_queries.py +++ b/src/ehrdata/io/omop/_queries.py @@ -19,6 +19,7 @@ "device_exposure": "device", "drug_era": "drug", "dose_era": "drug", + "condition_era": "condition", } DATA_TABLE_DATE_KEYS = { @@ -37,6 +38,7 @@ "device_exposure": "device_exposure_start_date", "drug_era": "drug_era_start_date", "dose_era": "dose_era_start_date", + "condition_era": "condition_era_start_date", }, "end": { "visit_occurrence": "visit_end_date", @@ -48,6 +50,7 @@ "device_exposure": "device_exposure_end_date", "drug_era": "drug_era_end_date", "dose_era": "dose_era_end_date", + "condition_era": "condition_era_end_date", }, } diff --git a/tests/data/toy_omop/vanilla/condition_era.csv b/tests/data/toy_omop/vanilla/condition_era.csv new file mode 100644 index 0000000..f5c2cf0 --- /dev/null +++ b/tests/data/toy_omop/vanilla/condition_era.csv @@ -0,0 +1,10 @@ +condition_era_id,person_id,condition_concept_id,condition_era_start_date,condition_era_end_date,condition_occurrence_count +1,1,4140598,2100-01-01,2100-01-31,256 +2,1,4140598,2100-02-01,2100-02-28,256 +3,1,434610,2100-01-01,2100-01-31,1 +4,2,4140598,2100-01-01,2100-01-31,256 +5,2,4140598,2100-02-01,2100-02-28,256 +6,2,434610,2100-01-01,2100-01-31,1 +7,3,4140598,2100-01-01,2100-01-31,256 +8,3,4140598,2100-02-01,2100-02-28,256 +9,3,434610,2100-01-01,2100-01-31,1 diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py index 2a2c205..f649180 100644 --- a/tests/test_io/test_omop.py +++ b/tests/test_io/test_omop.py @@ -21,6 +21,7 @@ "device_exposure": 2, "drug_era": 2, "dose_era": 2, + "condition_era": 2, } # constants for setup_variables @@ -566,6 +567,66 @@ def test_setup_variables( [[1, 1, 1, 1], [1, 1, 1, 1]], ], ), + ( + ["condition_era"], + 
["condition_occurrence_count"], + "start", + [ + [[1, np.nan, np.nan, np.nan], [256, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [256, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [256, np.nan, np.nan, np.nan]], + ], + ), + ( + ["condition_era"], + ["condition_occurrence_count"], + "end", + [ + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + ], + ), + ( + ["condition_era"], + ["condition_occurrence_count"], + "interval", + [ + [[1, 1, 1, 1], [256, 256, 256, 256]], + [[1, 1, 1, 1], [256, 256, 256, 256]], + [[1, 1, 1, 1], [256, 256, 256, 256]], + ], + ), + ( + ["condition_era"], + ["is_present"], + "start", + [ + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + ], + ), + ( + ["condition_era"], + ["is_present"], + "end", + [ + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + ], + ), + ( + ["condition_era"], + ["is_present"], + "interval", + [ + [[1, 1, 1, 1], [1, 1, 1, 1]], + [[1, 1, 1, 1], [1, 1, 1, 1]], + [[1, 1, 1, 1], [1, 1, 1, 1]], + ], + ), ], ) @pytest.mark.parametrize( From 623ebab2a0db365200d9cc93c4daaa4263f2ccc6 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Fri, 22 Nov 2024 16:13:21 +0100 Subject: [PATCH 42/43] support episode --- src/ehrdata/io/omop/_check_arguments.py | 1 + src/ehrdata/io/omop/_queries.py | 3 ++ src/ehrdata/utils/_omop_utils.py | 1 + tests/data/toy_omop/vanilla/episode.csv | 10 ++++ tests/test_dt/test_dt.py | 2 +- tests/test_io/test_omop.py | 61 +++++++++++++++++++++++++ 6 files changed, 77 insertions(+), 1 deletion(-) create mode 100644 tests/data/toy_omop/vanilla/episode.csv diff --git a/src/ehrdata/io/omop/_check_arguments.py b/src/ehrdata/io/omop/_check_arguments.py index 5eb9528..9a5930f 100644 --- a/src/ehrdata/io/omop/_check_arguments.py +++ b/src/ehrdata/io/omop/_check_arguments.py @@ -20,6 +20,7 @@ "drug_era", "dose_era", "condition_era", + "episode", ] VALID_KEEP_DATES = ["start", "end", "interval"] diff --git a/src/ehrdata/io/omop/_queries.py b/src/ehrdata/io/omop/_queries.py index 0885726..2cdaa34 100644 --- a/src/ehrdata/io/omop/_queries.py +++ b/src/ehrdata/io/omop/_queries.py @@ -20,6 +20,7 @@ "drug_era": "drug", "dose_era": "drug", "condition_era": "condition", + "episode": "episode", } DATA_TABLE_DATE_KEYS = { @@ -39,6 +40,7 @@ "drug_era": "drug_era_start_date", "dose_era": "dose_era_start_date", "condition_era": "condition_era_start_date", + "episode": "episode_start_date", }, "end": { "visit_occurrence": "visit_end_date", @@ -51,6 +53,7 @@ "drug_era": "drug_era_end_date", "dose_era": "dose_era_end_date", "condition_era": "condition_era_end_date", + "episode": "episode_end_date", }, } diff --git a/src/ehrdata/utils/_omop_utils.py b/src/ehrdata/utils/_omop_utils.py index 2b52d02..c2b1834 100644 --- a/src/ehrdata/utils/_omop_utils.py +++ b/src/ehrdata/utils/_omop_utils.py @@ -43,6 +43,7 @@ def get_table_catalog_dict(version: Literal["5.4"] = "5.4"): "note_nlp", "observation", "fact_relationship", + "episode", ] table_catalog_dict["Health system data"] = ["location", "care_site", "provider"] diff --git a/tests/data/toy_omop/vanilla/episode.csv 
b/tests/data/toy_omop/vanilla/episode.csv new file mode 100644 index 0000000..59ecf81 --- /dev/null +++ b/tests/data/toy_omop/vanilla/episode.csv @@ -0,0 +1,10 @@ +episode_id,person_id,episode_concept_id,episode_start_date,episode_start_datetime,episode_end_date,episode_end_datetime,episode_parent_id,episode_number,episode_object_concept_id,episode_type_concept_id,episode_source_value,episode_source_concept_id +1,1,32941,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,,,,,10, +2,1,32941,2100-02-01,2100-02-01 12:00:00,2100-02-28,2100-02-28 00:00:00,,,,,10, +3,1,32531,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,,,,,5, +4,2,32941,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,,,,,10, +5,2,32941,2100-02-01,2100-02-01 12:00:00,2100-02-28,2100-02-28 00:00:00,,,,,10, +6,2,32531,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,,,,,5, +7,3,32941,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,,,,,10, +8,3,32941,2100-02-01,2100-02-01 12:00:00,2100-02-28,2100-02-28 00:00:00,,,,,10, +9,3,32531,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,,,,,5, diff --git a/tests/test_dt/test_dt.py b/tests/test_dt/test_dt.py index 02fb030..65accff 100644 --- a/tests/test_dt/test_dt.py +++ b/tests/test_dt/test_dt.py @@ -34,7 +34,7 @@ def test_gibleed_omop(tmp_path): def test_synthea27nj_omop(tmp_path): duckdb_connection = duckdb.connect() ed.dt.synthea27nj_omop(data_path=tmp_path, backend_handle=duckdb_connection) - assert len(duckdb_connection.execute("SHOW TABLES").df()) == 37 + assert len(duckdb_connection.execute("SHOW TABLES").df()) == 38 # sanity check of one table assert duckdb_connection.execute("SELECT * FROM person").df().shape == (28, 18) duckdb_connection.close() diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py index f649180..5e77731 100644 --- a/tests/test_io/test_omop.py +++ b/tests/test_io/test_omop.py @@ -22,6 +22,7 @@ "drug_era": 2, "dose_era": 2, "condition_era": 2, + "episode": 2, } # constants for setup_variables @@ -627,6 +628,66 @@ def test_setup_variables( [[1, 1, 1, 1], [1, 1, 1, 1]], ], ), + ( + ["episode"], + ["episode_source_value"], + "start", + [ + [[5, np.nan, np.nan, np.nan], [10, np.nan, np.nan, np.nan]], + [[5, np.nan, np.nan, np.nan], [10, np.nan, np.nan, np.nan]], + [[5, np.nan, np.nan, np.nan], [10, np.nan, np.nan, np.nan]], + ], + ), + ( + ["episode"], + ["episode_source_value"], + "end", + [ + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + ], + ), + ( + ["episode"], + ["episode_source_value"], + "interval", + [ + [[5, 5, 5, 5], [10, 10, 10, 10]], + [[5, 5, 5, 5], [10, 10, 10, 10]], + [[5, 5, 5, 5], [10, 10, 10, 10]], + ], + ), + ( + ["episode"], + ["is_present"], + "start", + [ + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + ], + ), + ( + ["episode"], + ["is_present"], + "end", + [ + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + ], + ), + ( + ["episode"], + ["is_present"], + "interval", + [ + [[1, 1, 1, 1], [1, 1, 1, 1]], + [[1, 1, 1, 1], [1, 1, 1, 1]], + [[1, 1, 1, 1], [1, 1, 1, 1]], + ], + ), ], ) 
@pytest.mark.parametrize( From 8b070c47f27505f5d6c05ce05f1d83b5d66651dc Mon Sep 17 00:00:00 2001 From: eroell Date: Fri, 22 Nov 2024 17:02:45 +0100 Subject: [PATCH 43/43] Refactor time interval table query function --- src/ehrdata/io/omop/_queries.py | 94 +++-------------- src/ehrdata/io/omop/omop.py | 54 +++------- tests/test_io/test_omop.py | 180 ++++++++------------------------ 3 files changed, 79 insertions(+), 249 deletions(-) diff --git a/src/ehrdata/io/omop/_queries.py b/src/ehrdata/io/omop/_queries.py index 2cdaa34..f1937c5 100644 --- a/src/ehrdata/io/omop/_queries.py +++ b/src/ehrdata/io/omop/_queries.py @@ -115,7 +115,7 @@ def _generate_value_query(data_table: str, data_field_to_keep: Sequence, aggrega return is_present_query + value_query -def time_interval_table_query_long_format( +def _time_interval_table( backend_handle: duckdb.duckdb.DuckDBPyConnection, time_defining_table: str, data_table: str, @@ -124,14 +124,13 @@ def time_interval_table_query_long_format( num_intervals: int, aggregation_strategy: str, data_field_to_keep: Sequence[str] | str, - date_prefix: str = "", -) -> pd.DataFrame: - """Returns a long format DataFrame from the data_table. The following columns should be considered the indices of this long format: person_id, data_table_concept_id, interval_step. The other columns, except for start_date and end_date, should be considered the values.""" + keep_date: str = "", +): if isinstance(data_field_to_keep, str): data_field_to_keep = [data_field_to_keep] - if date_prefix == "": - date_prefix = "timepoint" + if keep_date == "": + keep_date = "timepoint" timedeltas_dataframe = _generate_timedeltas(interval_length_number, interval_length_unit, num_intervals) @@ -146,8 +145,7 @@ def time_interval_table_query_long_format( # 3. Create long_format_backbone, which is the left join of person_time_defining_table and person_data_table. # 4. Create long_format_intervals, which is the cross product of long_format_backbone and timedeltas. This table contains most notably the person_id, the concept_id, the interval start and end dates. # 5. Create the final table, which is the join with the data_table (typically measurement); each measurement is assigned to its person_id, its concept_id, and the interval it fits into. 
- df = backend_handle.execute( - f""" + prepare_alias_query = f""" WITH person_time_defining_table AS ( \ SELECT person.person_id as person_id, {DATA_TABLE_DATE_KEYS["start"][time_defining_table]} as start_date, {DATA_TABLE_DATE_KEYS["end"][time_defining_table]} as end_date \ FROM person \ @@ -176,79 +174,18 @@ def time_interval_table_query_long_format( SELECT *, 1 as is_present \ FROM {data_table} \ ) \ + """ + + if keep_date in ["timepoint", "start", "end"]: + select_query = f""" SELECT lfi.person_id, lfi.data_table_concept_id, interval_step, interval_start, interval_end, {_generate_value_query("data_table_with_presence_indicator", data_field_to_keep, AGGREGATION_STRATEGY_KEY[aggregation_strategy])} \ FROM long_format_intervals as lfi \ - LEFT JOIN data_table_with_presence_indicator ON lfi.person_id = data_table_with_presence_indicator.person_id AND lfi.data_table_concept_id = data_table_with_presence_indicator.{DATA_TABLE_CONCEPT_ID_TRUNK[data_table]}_concept_id AND data_table_with_presence_indicator.{DATA_TABLE_DATE_KEYS[date_prefix][data_table]} BETWEEN lfi.interval_start AND lfi.interval_end \ + LEFT JOIN data_table_with_presence_indicator ON lfi.person_id = data_table_with_presence_indicator.person_id AND lfi.data_table_concept_id = data_table_with_presence_indicator.{DATA_TABLE_CONCEPT_ID_TRUNK[data_table]}_concept_id AND data_table_with_presence_indicator.{DATA_TABLE_DATE_KEYS[keep_date][data_table]} BETWEEN lfi.interval_start AND lfi.interval_end \ GROUP BY lfi.person_id, lfi.data_table_concept_id, interval_step, interval_start, interval_end """ - ).df() - - _drop_timedeltas(backend_handle) - return df - - -def time_interval_table_for_interval_tables_query_long_format( - backend_handle: duckdb.duckdb.DuckDBPyConnection, - time_defining_table: str, - data_table: str, - interval_length_number: int, - interval_length_unit: str, - num_intervals: int, - aggregation_strategy: str, - data_field_to_keep: Sequence[str] | str, - date_prefix: str = "", -) -> pd.DataFrame: - """Returns a long format DataFrame from the data_table. The following columns should be considered the indices of this long format: person_id, data_table_concept_id, interval_step. The other columns, except for start_date and end_date, should be considered the values.""" - if isinstance(data_field_to_keep, str): - data_field_to_keep = [data_field_to_keep] - - if date_prefix != "": - date_prefix = date_prefix + "_" - - timedeltas_dataframe = _generate_timedeltas(interval_length_number, interval_length_unit, num_intervals) - - _write_timedeltas_to_db( - backend_handle, - timedeltas_dataframe, - ) - - # multi-step query - # 1. Create person_time_defining_table, which matches the one created for obs. Needs to contain the person_id, and the start date in particular. - # 2. Create person_data_table (data_table is typically measurement), which contains the cross product of person_id and the distinct concept_id s. - # 3. Create long_format_backbone, which is the left join of person_time_defining_table and person_data_table. - # 4. Create long_format_intervals, which is the cross product of long_format_backbone and timedeltas. This table contains most notably the person_id, the concept_id, the interval start and end dates. - # 5. Create the final table, which is the join with the data_table (typically measurement); each measurement is assigned to its person_id, its concept_id, and the interval it fits into. 
- df = backend_handle.execute( - f""" - WITH person_time_defining_table AS ( \ - SELECT person.person_id as person_id, {DATA_TABLE_DATE_KEYS["start"][time_defining_table]} as start_date, {DATA_TABLE_DATE_KEYS["end"][time_defining_table]} as end_date \ - FROM person \ - JOIN {time_defining_table} ON person.person_id = {time_defining_table}.{TIME_DEFINING_TABLE_SUBJECT_KEY[time_defining_table]} \ - ), \ - person_data_table AS( \ - WITH distinct_data_table_concept_ids AS ( \ - SELECT DISTINCT {DATA_TABLE_CONCEPT_ID_TRUNK[data_table]}_concept_id - FROM {data_table} \ - ) - SELECT person.person_id, {DATA_TABLE_CONCEPT_ID_TRUNK[data_table]}_concept_id as data_table_concept_id \ - FROM person \ - CROSS JOIN distinct_data_table_concept_ids \ - ), \ - long_format_backbone as ( \ - SELECT person_time_defining_table.person_id, data_table_concept_id, start_date, end_date \ - FROM person_time_defining_table \ - LEFT JOIN person_data_table USING(person_id)\ - ), \ - long_format_intervals as ( \ - SELECT person_id, data_table_concept_id, interval_step, start_date, start_date + interval_start_offset as interval_start, start_date + interval_end_offset as interval_end \ - FROM long_format_backbone \ - CROSS JOIN timedeltas \ - ), \ - data_table_with_presence_indicator as( \ - SELECT *, 1 as is_present \ - FROM {data_table} \ - ) \ + elif keep_date == "interval": + select_query = f""" SELECT lfi.person_id, lfi.data_table_concept_id, interval_step, interval_start, interval_end, {_generate_value_query("data_table_with_presence_indicator", data_field_to_keep, AGGREGATION_STRATEGY_KEY[aggregation_strategy])} \ FROM long_format_intervals as lfi \ LEFT JOIN data_table_with_presence_indicator ON lfi.person_id = data_table_with_presence_indicator.person_id \ @@ -258,7 +195,10 @@ def time_interval_table_for_interval_tables_query_long_format( OR (data_table_with_presence_indicator.{DATA_TABLE_DATE_KEYS["start"][data_table]} < lfi.interval_start AND data_table_with_presence_indicator.{DATA_TABLE_DATE_KEYS["end"][data_table]} > lfi.interval_end)) \ GROUP BY lfi.person_id, lfi.data_table_concept_id, interval_step, interval_start, interval_end """ - ).df() + + query = prepare_alias_query + select_query + + df = backend_handle.execute(query).df() _drop_timedeltas(backend_handle) diff --git a/src/ehrdata/io/omop/omop.py b/src/ehrdata/io/omop/omop.py index 85d8de7..8f8a5c4 100644 --- a/src/ehrdata/io/omop/omop.py +++ b/src/ehrdata/io/omop/omop.py @@ -31,10 +31,7 @@ _check_valid_observation_table, _check_valid_variable_data_tables, ) -from ehrdata.io.omop._queries import ( - time_interval_table_for_interval_tables_query_long_format, - time_interval_table_query_long_format, -) +from ehrdata.io.omop._queries import _time_interval_table from ehrdata.utils._omop_utils import get_table_catalog_dict DOWNLOAD_VERIFICATION_TAG = "download_verification_tag" @@ -335,7 +332,7 @@ def setup_variables( return edata ds = ( - time_interval_table_query_long_format( + _time_interval_table( backend_handle=backend_handle, time_defining_table=time_defining_table, data_table=data_tables[0], @@ -437,7 +434,7 @@ def setup_interval_variables( Strategy to use when aggregating multiple data points within one interval. enrich_var_with_feature_info Whether to enrich the var table with feature information. If a concept_id is not found in the concept table, the feature information will be NaN. - keep_date + date_type Whether to keep the start or end date, or the interval span. 
Returns @@ -469,38 +466,21 @@ def setup_interval_variables( logging.info(f"No data in {data_tables}.") return edata - if keep_date == "start" or keep_date == "end": - ds = ( - time_interval_table_query_long_format( - backend_handle=backend_handle, - time_defining_table=time_defining_table, - data_table=data_tables[0], - data_field_to_keep=data_field_to_keep, - interval_length_number=interval_length_number, - interval_length_unit=interval_length_unit, - num_intervals=num_intervals, - aggregation_strategy=aggregation_strategy, - date_prefix=keep_date, - ) - .set_index(["person_id", "data_table_concept_id", "interval_step"]) - .to_xarray() - ) - elif keep_date == "interval": - ds = ( - time_interval_table_for_interval_tables_query_long_format( - backend_handle=backend_handle, - time_defining_table=time_defining_table, - data_table=data_tables[0], - data_field_to_keep=data_field_to_keep, - interval_length_number=interval_length_number, - interval_length_unit=interval_length_unit, - num_intervals=num_intervals, - aggregation_strategy=aggregation_strategy, - date_prefix=keep_date, - ) - .set_index(["person_id", "data_table_concept_id", "interval_step"]) - .to_xarray() + ds = ( + _time_interval_table( + backend_handle=backend_handle, + time_defining_table=time_defining_table, + data_table=data_tables[0], + data_field_to_keep=data_field_to_keep, + interval_length_number=interval_length_number, + interval_length_unit=interval_length_unit, + num_intervals=num_intervals, + aggregation_strategy=aggregation_strategy, + keep_date=keep_date, ) + .set_index(["person_id", "data_table_concept_id", "interval_step"]) + .to_xarray() + ) var = ds["data_table_concept_id"].to_dataframe() diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py index 5e77731..ac426e7 100644 --- a/tests/test_io/test_omop.py +++ b/tests/test_io/test_omop.py @@ -25,6 +25,24 @@ "episode": 2, } +VANILLA_IS_PRESENT_START = [ + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], +] + +VANILLA_IS_PRESENT_END = [ + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], +] + +VANILLA_IS_PRESENT_INTERVAL = [ + [[1, 1, 1, 1], [1, 1, 1, 1]], + [[1, 1, 1, 1], [1, 1, 1, 1]], + [[1, 1, 1, 1], [1, 1, 1, 1]], +] + # constants for setup_variables # only data_table_concept_id VAR_DIM_BASE = 1 @@ -113,11 +131,7 @@ def test_setup_obs_invalid_observation_table_value(omop_connection_vanilla): ( ["measurement"], ["is_present"], - [ - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - ], + VANILLA_IS_PRESENT_START, ), ( ["observation"], @@ -131,11 +145,7 @@ def test_setup_obs_invalid_observation_table_value(omop_connection_vanilla): ( ["observation"], ["is_present"], - [ - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - ], + VANILLA_IS_PRESENT_START, ), ( ["specimen"], @@ -149,11 +159,7 @@ def test_setup_obs_invalid_observation_table_value(omop_connection_vanilla): ( ["specimen"], ["is_present"], - [ - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, 
np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - ], + VANILLA_IS_PRESENT_START, ), ], ) @@ -242,31 +248,19 @@ def test_setup_variables( ["drug_exposure"], ["is_present"], "start", - [ - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - ], + VANILLA_IS_PRESENT_START, ), ( ["drug_exposure"], ["is_present"], "end", - [ - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - ], + VANILLA_IS_PRESENT_END, ), ( ["drug_exposure"], ["is_present"], "interval", - [ - [[1, 1, 1, 1], [1, 1, 1, 1]], - [[1, 1, 1, 1], [1, 1, 1, 1]], - [[1, 1, 1, 1], [1, 1, 1, 1]], - ], + VANILLA_IS_PRESENT_INTERVAL, ), ( ["condition_occurrence"], @@ -302,31 +296,19 @@ def test_setup_variables( ["condition_occurrence"], ["is_present"], "start", - [ - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - ], + VANILLA_IS_PRESENT_START, ), ( ["condition_occurrence"], ["is_present"], "end", - [ - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - ], + VANILLA_IS_PRESENT_END, ), ( ["condition_occurrence"], ["is_present"], "interval", - [ - [[1, 1, 1, 1], [1, 1, 1, 1]], - [[1, 1, 1, 1], [1, 1, 1, 1]], - [[1, 1, 1, 1], [1, 1, 1, 1]], - ], + VANILLA_IS_PRESENT_INTERVAL, ), ( ["procedure_occurrence"], @@ -362,31 +344,19 @@ def test_setup_variables( ["procedure_occurrence"], ["is_present"], "start", - [ - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - ], + VANILLA_IS_PRESENT_START, ), ( ["procedure_occurrence"], ["is_present"], "end", - [ - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - ], + VANILLA_IS_PRESENT_END, ), ( ["procedure_occurrence"], ["is_present"], "interval", - [ - [[1, 1, 1, 1], [1, 1, 1, 1]], - [[1, 1, 1, 1], [1, 1, 1, 1]], - [[1, 1, 1, 1], [1, 1, 1, 1]], - ], + VANILLA_IS_PRESENT_INTERVAL, ), ( ["device_exposure"], @@ -422,31 +392,19 @@ def test_setup_variables( ["device_exposure"], ["is_present"], "start", - [ - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - ], + VANILLA_IS_PRESENT_START, ), ( ["device_exposure"], ["is_present"], "end", - [ - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - ], + VANILLA_IS_PRESENT_END, ), ( ["device_exposure"], ["is_present"], "interval", - [ - [[1, 1, 1, 1], [1, 1, 1, 1]], - [[1, 1, 1, 1], [1, 1, 1, 1]], - [[1, 1, 1, 1], [1, 1, 1, 1]], - ], + VANILLA_IS_PRESENT_INTERVAL, ), ( ["drug_era"], @@ -482,31 +440,19 @@ def test_setup_variables( 
["drug_era"], ["is_present"], "start", - [ - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - ], + VANILLA_IS_PRESENT_START, ), ( ["drug_era"], ["is_present"], "end", - [ - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - ], + VANILLA_IS_PRESENT_END, ), ( ["drug_era"], ["is_present"], "interval", - [ - [[1, 1, 1, 1], [1, 1, 1, 1]], - [[1, 1, 1, 1], [1, 1, 1, 1]], - [[1, 1, 1, 1], [1, 1, 1, 1]], - ], + VANILLA_IS_PRESENT_INTERVAL, ), ( ["dose_era"], @@ -542,31 +488,19 @@ def test_setup_variables( ["dose_era"], ["is_present"], "start", - [ - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - ], + VANILLA_IS_PRESENT_START, ), ( ["dose_era"], ["is_present"], "end", - [ - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - ], + VANILLA_IS_PRESENT_END, ), ( ["dose_era"], ["is_present"], "interval", - [ - [[1, 1, 1, 1], [1, 1, 1, 1]], - [[1, 1, 1, 1], [1, 1, 1, 1]], - [[1, 1, 1, 1], [1, 1, 1, 1]], - ], + VANILLA_IS_PRESENT_INTERVAL, ), ( ["condition_era"], @@ -602,31 +536,19 @@ def test_setup_variables( ["condition_era"], ["is_present"], "start", - [ - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - ], + VANILLA_IS_PRESENT_START, ), ( ["condition_era"], ["is_present"], "end", - [ - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - ], + VANILLA_IS_PRESENT_END, ), ( ["condition_era"], ["is_present"], "interval", - [ - [[1, 1, 1, 1], [1, 1, 1, 1]], - [[1, 1, 1, 1], [1, 1, 1, 1]], - [[1, 1, 1, 1], [1, 1, 1, 1]], - ], + VANILLA_IS_PRESENT_INTERVAL, ), ( ["episode"], @@ -662,31 +584,19 @@ def test_setup_variables( ["episode"], ["is_present"], "start", - [ - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - ], + VANILLA_IS_PRESENT_START, ), ( ["episode"], ["is_present"], "end", - [ - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - ], + VANILLA_IS_PRESENT_END, ), ( ["episode"], ["is_present"], "interval", - [ - [[1, 1, 1, 1], [1, 1, 1, 1]], - [[1, 1, 1, 1], [1, 1, 1, 1]], - [[1, 1, 1, 1], [1, 1, 1, 1]], - ], + VANILLA_IS_PRESENT_INTERVAL, ), ], )