From 7d7c57e6b4cbaf87e7b1b33296692ad7eb513063 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Sat, 2 Nov 2024 23:00:00 +0100 Subject: [PATCH 01/43] first change in notebook --- .../tutorial_omop_visualization.ipynb | 329 +++++++++++++----- 1 file changed, 244 insertions(+), 85 deletions(-) diff --git a/docs/notebooks/tutorial_omop_visualization.ipynb b/docs/notebooks/tutorial_omop_visualization.ipynb index bc60f09..982a744 100644 --- a/docs/notebooks/tutorial_omop_visualization.ipynb +++ b/docs/notebooks/tutorial_omop_visualization.ipynb @@ -55,18 +55,9 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. To reload it, use:\n", - " %reload_ext autoreload\n" - ] - } - ], + "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2" @@ -74,7 +65,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -88,7 +79,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -97,7 +88,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -111,7 +102,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -121,7 +112,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -144,7 +135,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -160,7 +151,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -168,7 +159,8 @@ "output_type": "stream", "text": [ "Path to data exists, load tables from there: ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9\n", - "missing tables: [['concept'], ['vocabulary'], ['domain'], ['concept_class'], ['concept_relationship'], ['relationship'], ['concept_synonym'], ['concept_ancestor'], ['source_to_concept_map'], ['drug_strength']]\n" + "missing tables: ['domain', 'concept_class', 'relationship', 'concept_synonym', 'concept_ancestor', 'source_to_concept_map', 'drug_strength']\n", + "unused files: ['attribute_definition.csv', 'cohort_attribute.csv']\n" ] } ], @@ -185,7 +177,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -231,96 +223,108 @@ " \n", " \n", " 4\n", - " condition_era\n", + " concept\n", " \n", " \n", " 5\n", - " condition_occurrence\n", + " concept_relationship\n", " \n", " \n", " 6\n", - " cost\n", + " condition_era\n", " \n", " \n", " 7\n", - " death\n", + " condition_occurrence\n", " \n", " \n", " 8\n", - " device_exposure\n", + " cost\n", " \n", " \n", " 9\n", - " dose_era\n", + " death\n", " \n", " \n", " 10\n", - " drug_era\n", + " device_exposure\n", " \n", " \n", " 11\n", - " drug_exposure\n", + " dose_era\n", " \n", " \n", " 12\n", - " fact_relationship\n", + " drug_era\n", " \n", " \n", " 13\n", - " location\n", + " drug_exposure\n", " \n", " \n", " 14\n", - " measurement\n", + " fact_relationship\n", " \n", " \n", " 15\n", - " metadata\n", + " location\n", " \n", " \n", " 16\n", - " note\n", + " measurement\n", " \n", " \n", " 17\n", - " note_nlp\n", + " metadata\n", " \n", " \n", " 18\n", - " observation\n", + " note\n", " 
\n", " \n", " 19\n", - " observation_period\n", + " note_nlp\n", " \n", " \n", " 20\n", - " payer_plan_period\n", + " observation\n", " \n", " \n", " 21\n", - " person\n", + " observation_period\n", " \n", " \n", " 22\n", - " procedure_occurrence\n", + " payer_plan_period\n", " \n", " \n", " 23\n", - " provider\n", + " person\n", " \n", " \n", " 24\n", - " specimen\n", + " procedure_occurrence\n", " \n", " \n", " 25\n", - " visit_detail\n", + " provider\n", " \n", " \n", " 26\n", + " specimen\n", + " \n", + " \n", + " 27\n", + " visit_detail\n", + " \n", + " \n", + " 28\n", " visit_occurrence\n", " \n", + " \n", + " 29\n", + " vocabulary\n", + " \n", " \n", "\n", "" @@ -331,32 +335,35 @@ "1 cdm_source\n", "2 cohort\n", "3 cohort_definition\n", - "4 condition_era\n", - "5 condition_occurrence\n", - "6 cost\n", - "7 death\n", - "8 device_exposure\n", - "9 dose_era\n", - "10 drug_era\n", - "11 drug_exposure\n", - "12 fact_relationship\n", - "13 location\n", - "14 measurement\n", - "15 metadata\n", - "16 note\n", - "17 note_nlp\n", - "18 observation\n", - "19 observation_period\n", - "20 payer_plan_period\n", - "21 person\n", - "22 procedure_occurrence\n", - "23 provider\n", - "24 specimen\n", - "25 visit_detail\n", - "26 visit_occurrence" + "4 concept\n", + "5 concept_relationship\n", + "6 condition_era\n", + "7 condition_occurrence\n", + "8 cost\n", + "9 death\n", + "10 device_exposure\n", + "11 dose_era\n", + "12 drug_era\n", + "13 drug_exposure\n", + "14 fact_relationship\n", + "15 location\n", + "16 measurement\n", + "17 metadata\n", + "18 note\n", + "19 note_nlp\n", + "20 observation\n", + "21 observation_period\n", + "22 payer_plan_period\n", + "23 person\n", + "24 procedure_occurrence\n", + "25 provider\n", + "26 specimen\n", + "27 visit_detail\n", + "28 visit_occurrence\n", + "29 vocabulary" ] }, - "execution_count": 27, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -368,13 +375,13 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "eb69f3700fb343a8b204fde5a22a5d2b", + "model_id": "9ec20041bac2441283e9998549a5a1aa", "version_major": 2, "version_minor": 0 }, @@ -392,7 +399,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -403,7 +410,7 @@ " shape of .r: (0, 0, 0) " ] }, - "execution_count": 29, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -442,13 +449,13 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "260450539f1b4ebba16f17460d50d40f", + "model_id": "df43d6348b9f4c089065ce35d2a7ed78", "version_major": 2, "version_minor": 0 }, @@ -466,37 +473,189 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "multiple units for features: [[ 0]\n", + " [ 1]\n", + " [ 23]\n", + " [ 55]\n", + " [122]\n", + " [160]\n", + " [245]\n", + " [296]\n", + " [306]\n", + " [349]\n", + " [418]]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
concept_idunit_concept_idno_unitsmultiple_units
009557FalseTrue
108749FalseTrue
208923FalseTrue
308840FalseTrue
408859FalseTrue
...............
4644252714044777590FalseFalse
46542868642<NA>TrueFalse
46643055270<NA>TrueFalse
46746236952<NA>TrueFalse
46820000000008554FalseFalse
\n", + "

469 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " concept_id unit_concept_id no_units multiple_units\n", + "0 0 9557 False True\n", + "1 0 8749 False True\n", + "2 0 8923 False True\n", + "3 0 8840 False True\n", + "4 0 8859 False True\n", + ".. ... ... ... ...\n", + "464 42527140 44777590 False False\n", + "465 42868642 True False\n", + "466 43055270 True False\n", + "467 46236952 True False\n", + "468 2000000000 8554 False False\n", + "\n", + "[469 rows x 4 columns]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "edata = ed.io.omop.setup_variables(\n", - " backend_handle=con,\n", " edata=edata,\n", - " tables=list(selected_vars.value),\n", - " start_time=\"observation_period_start_date\",\n", - " interval_length_number=28,\n", + " backend_handle=con,\n", + " data_tables=list(selected_vars.value),\n", + " data_field_to_keep=[\"value_as_number\"],\n", + " interval_length_number=20,\n", " interval_length_unit=\"day\",\n", - " num_intervals=\"max_observation_duration\",\n", + " num_intervals=10,\n", " concept_ids=\"all\",\n", " aggregation_strategy=\"last\",\n", - ")" + " enrich_var_with_feature_info=True,\n", + " enrich_var_with_unit_info=False,\n", + ")\n", + "edata.uns[\"unit_report_measurement\"]" ] }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "EHRData object with n_obs x n_var = 100 x 450, and a timeseries of 320 steps.\n", + "EHRData object with n_obs x n_var = 100 x 450, and a timeseries of 10 steps.\n", " shape of .X: (0, 0) \n", - " shape of .r: (100, 450, 320) " + " shape of .r: (100, 450, 10) " ] }, - "execution_count": 32, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -507,7 +666,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ From 09ce9b540911b6a097abc1fc8db5e8c61c337365 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Mon, 4 Nov 2024 11:29:26 +0100 Subject: [PATCH 02/43] inspect why 2 omop dt fail on github ci --- src/ehrdata/dt/datasets.py | 19 +++++++++---------- tests/test_dt/test_dt.py | 37 +++++++++++++++++++++++-------------- 2 files changed, 32 insertions(+), 24 deletions(-) diff --git a/src/ehrdata/dt/datasets.py b/src/ehrdata/dt/datasets.py index 33545be..3db0469 100644 --- a/src/ehrdata/dt/datasets.py +++ b/src/ehrdata/dt/datasets.py @@ -75,7 +75,7 @@ def mimic_iv_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = N >>> con.execute("SHOW TABLES;").fetchall() """ if data_path is None: - data_path = "ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9" + data_path = Path("ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9") if os.path.exists(data_path): print(f"Path to data exists, load tables from there: {data_path}") @@ -85,7 +85,7 @@ def mimic_iv_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = N response = requests.get(URL) if response.status_code == 200: - # Step 2: Use zipfile and io to open the ZIP file in memory + # Use zipfile and io to open the ZIP file in memory with zipfile.ZipFile(io.BytesIO(response.content)) as z: # Extract all contents of the ZIP file z.extractall("ehrapy_data") # Specify the folder where files will be extracted @@ -93,8 +93,8 @@ def mimic_iv_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = N else: print(f"Failed to download the file. 
Status code: {response.status_code}") return - # TODO: capitalization, and lowercase, and containing the name - return _set_up_duckdb(data_path + "/1_omop_data_csv", backend_handle, prefix="2b_") + + return _set_up_duckdb(data_path / "1_omop_data_csv", backend_handle, prefix="2b_") def gibleed_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: @@ -133,9 +133,6 @@ def gibleed_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = No response = requests.get(URL) if response.status_code == 200: - # extract_path = data_path / "gibleed_data_csv" - # extract_path.mkdir(parents=True, exist_ok=True) - # Use zipfile and io to open the ZIP file in memory with zipfile.ZipFile(io.BytesIO(response.content)) as z: # Extract all contents of the ZIP file into the correct subdirectory @@ -144,16 +141,15 @@ def gibleed_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = No else: print(f"Failed to download the file. Status code: {response.status_code}") + return - # extracted_folder = next(data_path.iterdir(), data_path) - # extracted_folder = next((folder for folder in data_path.iterdir() if folder.is_dir() and "_csv" in folder.name and "__MACOSX" not in folder.name), data_path) return _set_up_duckdb(data_path / "GiBleed_5.3", backend_handle) def synthea27nj_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: """Loads the Synthea27NJ dataset in the OMOP Common Data model. - More details: https://github.com/darwin-eu/EunomiaDatasets/tree/main/datasets/Synthea27Nj. + More details: https://github.com/OHDSI/EunomiaDatasets/tree/main/datasets/Synthea27Nj. Parameters ---------- @@ -214,3 +210,6 @@ def mimic_ii(backend_handle: DuckDBPyConnection, data_path: Path | None = None) """Loads the MIMIC2 dataset""" # TODO: replace mimic_ii as is in ehrapy with its dict-of-table return time - map variables to OMOP? 
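For a quick interactive sanity check of the loaders touched in this commit, the same pattern as the reworked tests further down can be reused; a minimal sketch (assumes the package is installed and the dataset host is reachable):

```python
import duckdb

import ehrdata as ed

con = duckdb.connect()
ed.dt.mimic_iv_omop(backend_handle=con)
# every recognized OMOP table is now queryable by name
print(len(con.execute("SHOW TABLES").df()))
print(con.execute("SELECT * FROM person").df().shape)
con.close()
```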
raise NotImplementedError() + + +# TODO: physionet2012, physionet2019 diff --git a/tests/test_dt/test_dt.py b/tests/test_dt/test_dt.py index 72fa7a3..219bf35 100644 --- a/tests/test_dt/test_dt.py +++ b/tests/test_dt/test_dt.py @@ -1,25 +1,34 @@ +from pathlib import Path + import duckdb +import pytest import ehrdata as ed +TEST_DATA_DIR = Path(__file__).parent / "ehrapy_data" + -def test_mimic_iv_omop(): +@pytest.fixture(scope="function") +def duckdb_connection(): + """Fixture to create and return a DuckDB connection for testing.""" con = duckdb.connect() - ed.dt.mimic_iv_omop(backend_handle=con) - assert len(con.execute("SHOW TABLES").df()) == 30 + yield con con.close() -# TODO -# def test_gibleed_omop(): -# con = duckdb.connect() -# ed.dt.gibleed_omop(backend_handle=con) -# assert len(con.execute("SHOW TABLES").df()) == 36 -# con.close() +def test_mimic_iv_omop(duckdb_connection): + ed.dt.mimic_iv_omop(backend_handle=duckdb_connection) + assert len(duckdb_connection.execute("SHOW TABLES").df()) == 30 + assert duckdb_connection.execute("SELECT * FROM person").df().shape == (100, 18) + + +def test_gibleed_omop(duckdb_connection): + ed.dt.gibleed_omop(backend_handle=duckdb_connection) + assert len(duckdb_connection.execute("SHOW TABLES").df()) == 36 + assert duckdb_connection.execute("SELECT * FROM person").df().shape == (2694, 18) -# def test_synthea27nj_omop(): -# con = duckdb.connect() -# ed.dt.synthea27nj_omop(backend_handle=con) -# assert len(con.execute("SHOW TABLES").df()) == 37 -# con.close() +def test_synthea27nj_omop(duckdb_connection): + ed.dt.synthea27nj_omop(backend_handle=duckdb_connection) + assert len(duckdb_connection.execute("SHOW TABLES").df()) == 37 + assert duckdb_connection.execute("SELECT * FROM person").df().shape == (28, 18) From 54fe86e2cc9f86d6675fbed91915fa68781f4fcf Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Mon, 4 Nov 2024 11:36:07 +0100 Subject: [PATCH 03/43] inspect why 2 omop dt fail on github ci --- src/ehrdata/dt/datasets.py | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/src/ehrdata/dt/datasets.py b/src/ehrdata/dt/datasets.py index 3db0469..4eb3182 100644 --- a/src/ehrdata/dt/datasets.py +++ b/src/ehrdata/dt/datasets.py @@ -182,28 +182,17 @@ def synthea27nj_omop(backend_handle: DuckDBPyConnection, data_path: Path | None response = requests.get(URL) if response.status_code == 200: - extract_path = data_path / "synthea27nj_omop_csv" - extract_path.mkdir(parents=True, exist_ok=True) - # Use zipfile and io to open the ZIP file in memory with zipfile.ZipFile(io.BytesIO(response.content)) as z: # Extract all contents of the ZIP file into the correct subdirectory - z.extractall(extract_path) # Extracting to 'extract_path' - print(f"Download successful. ZIP file downloaded and extracted successfully to {extract_path}.") + z.extractall(data_path) # Extracting to 'extract_path' + print(f"Download successful. ZIP file downloaded and extracted successfully to {data_path}.") else: print(f"Failed to download the file. 
Status code: {response.status_code}") return - extracted_folder = next( - ( - folder - for folder in data_path.iterdir() - if folder.is_dir() and "_csv" in folder.name and "__MACOSX" not in folder.name - ), - data_path, - ) - return _set_up_duckdb(extracted_folder, backend_handle) + return _set_up_duckdb(data_path, backend_handle) def mimic_ii(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: From bebe5951153164bf2cc1fa9e2278fbe924010f69 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Mon, 4 Nov 2024 11:42:29 +0100 Subject: [PATCH 04/43] inspect why 2 omop dt fail on github ci --- src/ehrdata/dt/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ehrdata/dt/datasets.py b/src/ehrdata/dt/datasets.py index 4eb3182..7103771 100644 --- a/src/ehrdata/dt/datasets.py +++ b/src/ehrdata/dt/datasets.py @@ -36,7 +36,7 @@ def _set_up_duckdb(path: Path, backend_handle: DuckDBPyConnection, prefix: str = backend_handle.register( file_name_trunk.replace(prefix, ""), - backend_handle.read_csv(f"{path}/{file_name_trunk}.csv", dtype=dtype), + backend_handle.read_csv(f"{path}/{file_name}", dtype=dtype), ) else: unused_files.append(file_name) From 4c879a1511248e655b9b7d20300ae5595e181721 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Mon, 4 Nov 2024 12:06:25 +0100 Subject: [PATCH 05/43] reduce redundancy; enhance docstrings --- src/ehrdata/dt/datasets.py | 109 ++++++++++++++++++------------------- 1 file changed, 52 insertions(+), 57 deletions(-) diff --git a/src/ehrdata/dt/datasets.py b/src/ehrdata/dt/datasets.py index 7103771..c5bb1b8 100644 --- a/src/ehrdata/dt/datasets.py +++ b/src/ehrdata/dt/datasets.py @@ -18,6 +18,7 @@ def _get_table_list() -> list: def _set_up_duckdb(path: Path, backend_handle: DuckDBPyConnection, prefix: str = "") -> None: + """Create tables in the backend from the CSV files in the path from datasets in the OMOP Common Data model.""" tables = _get_table_list() used_tables = [] @@ -49,10 +50,40 @@ def _set_up_duckdb(path: Path, backend_handle: DuckDBPyConnection, prefix: str = print("unused files: ", unused_files) +def _setup_eunomia_datasets( + backend_handle: DuckDBPyConnection, + data_path: Path | None = None, + URL: str = None, + dataset_postfix: str = "", + dataset_prefix: str = "", +) -> None: + """Loads the Eunomia datasets in the OMOP Common Data model.""" + if os.path.exists(data_path): + print(f"Path to data exists, load tables from there: {data_path}") + else: + print("Downloading data...") + response = requests.get(URL) + + if response.status_code == 200: + # Use zipfile and io to open the ZIP file in memory + with zipfile.ZipFile(io.BytesIO(response.content)) as z: + # Extract all contents of the ZIP file + z.extractall("ehrapy_data") # Specify the folder where files will be extracted + print(f"Download successful. ZIP file downloaded and extracted successfully to {data_path}.") + else: + print(f"Failed to download the file. Status code: {response.status_code}") + return + + return _set_up_duckdb(data_path / dataset_postfix, backend_handle, prefix=dataset_prefix) + + def mimic_iv_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: """Loads the MIMIC-IV demo data in the OMOP Common Data model. - More details: https://physionet.org/content/mimic-iv-demo-omop/0.9/#files-panel. + This function loads the MIMIC-IV demo dataset from its `physionet repository _` . + See also this link for more details. + + DOI https://doi.org/10.13026/2d25-8g07. 
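For orientation, the registration step that `_set_up_duckdb` performs boils down to exposing each recognized OMOP CSV as a DuckDB table, stripping a dataset-specific file-name prefix; a simplified sketch (the real helper additionally reports missing tables and unused files, and the directory here is a stand-in):

```python
from pathlib import Path

import duckdb

con = duckdb.connect()
prefix = "2b_"  # the MIMIC-IV demo ships files like "2b_person.csv"
for csv_path in Path("ehrapy_data/some_omop_dataset").glob(f"{prefix}*.csv"):
    # "2b_person.csv" -> table name "person"
    table_name = csv_path.stem.removeprefix(prefix)
    con.register(table_name, con.read_csv(str(csv_path)))
```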
Parameters ---------- @@ -77,29 +108,19 @@ def mimic_iv_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = N if data_path is None: data_path = Path("ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9") - if os.path.exists(data_path): - print(f"Path to data exists, load tables from there: {data_path}") - else: - print("Downloading data...") - URL = "https://physionet.org/static/published-projects/mimic-iv-demo-omop/mimic-iv-demo-data-in-the-omop-common-data-model-0.9.zip" - response = requests.get(URL) - - if response.status_code == 200: - # Use zipfile and io to open the ZIP file in memory - with zipfile.ZipFile(io.BytesIO(response.content)) as z: - # Extract all contents of the ZIP file - z.extractall("ehrapy_data") # Specify the folder where files will be extracted - print(f"Download successful. ZIP file downloaded and extracted successfully to {data_path}.") - else: - print(f"Failed to download the file. Status code: {response.status_code}") - return - - return _set_up_duckdb(data_path / "1_omop_data_csv", backend_handle, prefix="2b_") + return _setup_eunomia_datasets( + backend_handle, + data_path, + URL="https://physionet.org/static/published-projects/mimic-iv-demo-omop/mimic-iv-demo-data-in-the-omop-common-data-model-0.9.zip", + dataset_postfix="1_omop_data_csv", + dataset_prefix="2b_", + ) def gibleed_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: """Loads the GIBleed dataset in the OMOP Common Data model. + This function loads the GIBleed dataset from the `EunomiaDatasets repository _`. More details: https://github.com/OHDSI/EunomiaDatasets/tree/main/datasets/GiBleed. Parameters @@ -125,30 +146,18 @@ def gibleed_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = No if data_path is None: data_path = Path("ehrapy_data/GIBleed_dataset") - if data_path.exists(): - print(f"Path to data exists, load tables from there: {data_path}") - else: - print("Downloading data...") - URL = "https://github.com/OHDSI/EunomiaDatasets/raw/main/datasets/GiBleed/GiBleed_5.3.zip" - response = requests.get(URL) - - if response.status_code == 200: - # Use zipfile and io to open the ZIP file in memory - with zipfile.ZipFile(io.BytesIO(response.content)) as z: - # Extract all contents of the ZIP file into the correct subdirectory - z.extractall(data_path) # Extracting to 'extract_path' - print(f"Download successful. ZIP file downloaded and extracted successfully to {data_path}.") - - else: - print(f"Failed to download the file. Status code: {response.status_code}") - return - - return _set_up_duckdb(data_path / "GiBleed_5.3", backend_handle) + return _setup_eunomia_datasets( + backend_handle, + data_path, + URL="https://github.com/OHDSI/EunomiaDatasets/raw/main/datasets/GiBleed/GiBleed_5.3.zip", + dataset_postfix="GiBleed_5.3", + ) def synthea27nj_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: """Loads the Synthea27NJ dataset in the OMOP Common Data model. + This function loads the Synthea27NJ dataset from the `EunomiaDatasets repository _`. More details: https://github.com/OHDSI/EunomiaDatasets/tree/main/datasets/Synthea27Nj. 
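The download-and-extract block that `_setup_eunomia_datasets` now centralizes is the standard requests/zipfile idiom; an equivalent sketch (using `raise_for_status` instead of the explicit status-code check in the helper):

```python
import io
import zipfile
from pathlib import Path

import requests

url = "https://github.com/OHDSI/EunomiaDatasets/raw/main/datasets/GiBleed/GiBleed_5.3.zip"
data_path = Path("ehrapy_data/GIBleed_dataset")

response = requests.get(url)
response.raise_for_status()  # raise on HTTP errors instead of returning silently
with zipfile.ZipFile(io.BytesIO(response.content)) as z:
    z.extractall(data_path)
```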
Parameters @@ -174,25 +183,11 @@ def synthea27nj_omop(backend_handle: DuckDBPyConnection, data_path: Path | None if data_path is None: data_path = Path("ehrapy_data/Synthea27Nj") - if data_path.exists(): - print(f"Path to data exists, load tables from there: {data_path}") - else: - print("Downloading data...") - URL = "https://github.com/OHDSI/EunomiaDatasets/raw/main/datasets/Synthea27Nj/Synthea27Nj_5.4.zip" - response = requests.get(URL) - - if response.status_code == 200: - # Use zipfile and io to open the ZIP file in memory - with zipfile.ZipFile(io.BytesIO(response.content)) as z: - # Extract all contents of the ZIP file into the correct subdirectory - z.extractall(data_path) # Extracting to 'extract_path' - print(f"Download successful. ZIP file downloaded and extracted successfully to {data_path}.") - - else: - print(f"Failed to download the file. Status code: {response.status_code}") - return - - return _set_up_duckdb(data_path, backend_handle) + return _setup_eunomia_datasets( + backend_handle, + data_path, + URL="https://github.com/OHDSI/EunomiaDatasets/raw/main/datasets/Synthea27Nj/Synthea27Nj_5.4.zip", + ) def mimic_ii(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: From 3253c1a75d8f4077604f7ca0e93bd147b81b66fc Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Mon, 4 Nov 2024 12:34:20 +0100 Subject: [PATCH 06/43] fix paths --- src/ehrdata/dt/datasets.py | 8 +++++--- tests/test_dt/test_dt.py | 3 +++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/ehrdata/dt/datasets.py b/src/ehrdata/dt/datasets.py index c5bb1b8..803e546 100644 --- a/src/ehrdata/dt/datasets.py +++ b/src/ehrdata/dt/datasets.py @@ -68,8 +68,10 @@ def _setup_eunomia_datasets( # Use zipfile and io to open the ZIP file in memory with zipfile.ZipFile(io.BytesIO(response.content)) as z: # Extract all contents of the ZIP file - z.extractall("ehrapy_data") # Specify the folder where files will be extracted - print(f"Download successful. ZIP file downloaded and extracted successfully to {data_path}.") + z.extractall(data_path) # Specify the folder where files will be extracted + print( + f"Download successful. ZIP file downloaded and extracted successfully to {data_path/dataset_postfix}." + ) else: print(f"Failed to download the file. 
Status code: {response.status_code}") return @@ -112,7 +114,7 @@ def mimic_iv_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = N backend_handle, data_path, URL="https://physionet.org/static/published-projects/mimic-iv-demo-omop/mimic-iv-demo-data-in-the-omop-common-data-model-0.9.zip", - dataset_postfix="1_omop_data_csv", + dataset_postfix="mimic-iv-demo-data-in-the-omop-common-data-model-0.9/1_omop_data_csv", dataset_prefix="2b_", ) diff --git a/tests/test_dt/test_dt.py b/tests/test_dt/test_dt.py index 219bf35..b78164d 100644 --- a/tests/test_dt/test_dt.py +++ b/tests/test_dt/test_dt.py @@ -19,16 +19,19 @@ def duckdb_connection(): def test_mimic_iv_omop(duckdb_connection): ed.dt.mimic_iv_omop(backend_handle=duckdb_connection) assert len(duckdb_connection.execute("SHOW TABLES").df()) == 30 + # sanity check of one table assert duckdb_connection.execute("SELECT * FROM person").df().shape == (100, 18) def test_gibleed_omop(duckdb_connection): ed.dt.gibleed_omop(backend_handle=duckdb_connection) assert len(duckdb_connection.execute("SHOW TABLES").df()) == 36 + # sanity check of one table assert duckdb_connection.execute("SELECT * FROM person").df().shape == (2694, 18) def test_synthea27nj_omop(duckdb_connection): ed.dt.synthea27nj_omop(backend_handle=duckdb_connection) assert len(duckdb_connection.execute("SHOW TABLES").df()) == 37 + # sanity check of one table assert duckdb_connection.execute("SELECT * FROM person").df().shape == (28, 18) From c32c47e3e483c6f551178db567e3dcbf043eb4d8 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Mon, 4 Nov 2024 17:38:36 +0100 Subject: [PATCH 07/43] towards more datasets --- src/ehrdata/dt/datasets.py | 86 ++++++++++++++++++++++++++++++++++++-- tests/test_dt/test_dt.py | 4 ++ 2 files changed, 86 insertions(+), 4 deletions(-) diff --git a/src/ehrdata/dt/datasets.py b/src/ehrdata/dt/datasets.py index 803e546..dd7b5a3 100644 --- a/src/ehrdata/dt/datasets.py +++ b/src/ehrdata/dt/datasets.py @@ -1,11 +1,17 @@ +from __future__ import annotations + import io import os import zipfile +from collections.abc import Sequence from pathlib import Path +from typing import TYPE_CHECKING import requests from duckdb.duckdb import DuckDBPyConnection +if TYPE_CHECKING: + from ehrdata import EHRData from ehrdata.utils._omop_utils import get_table_catalog_dict @@ -192,10 +198,82 @@ def synthea27nj_omop(backend_handle: DuckDBPyConnection, data_path: Path | None ) -def mimic_ii(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: - """Loads the MIMIC2 dataset""" - # TODO: replace mimic_ii as is in ehrapy with its dict-of-table return time - map variables to OMOP? +def physionet2012( + data_path: Path | None = None, + interval_length_number: int = 1, + interval_length_unit: str = "day", + num_intervals: int = 48, + aggregation_strategy: str = "last", + drop_samples: Sequence[str] = [ + 147514, + 142731, + 145611, + 140501, + 155655, + 143656, + 156254, + 150309, + 140936, + 141264, + 150649, + 142998, + ], +) -> EHRData: + """Loads the dataset of the `PhysioNet challenge 2012 (v1.0.0) _`. + + If interval_length_number is 1, interval_length_unit is "day", and num_intervals is 48, this is equivalent to the SAITS preprocessing (insert paper/link/citation). + Truncated if a sample has more num_intervals steps; Padded if a sample has less than num_intervals steps. 
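A sketch of the truncate-or-pad behaviour described above, under the assumption that missing steps are filled with NaN (the fill value is not pinned down here; requires a float array):

```python
import numpy as np


def pad_or_truncate(ts: np.ndarray, num_intervals: int) -> np.ndarray:
    """Truncate or NaN-pad the trailing (time) axis to exactly num_intervals steps."""
    if ts.shape[-1] >= num_intervals:
        return ts[..., :num_intervals]
    pad = [(0, 0)] * (ts.ndim - 1) + [(0, num_intervals - ts.shape[-1])]
    return np.pad(ts, pad, constant_values=np.nan)
```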
+ Further, by default the following 12 samples are dropped since they have no time series information at all: 147514, 142731, 145611, 140501, 155655, 143656, 156254, 150309, + 140936, 141264, 150649, 142998. + + Taken the defaults of interval_length_number, interval_length_unit, num_intervals, and drop_samples, the tensor stored in .r of edata is the same as when doing the PyPOTS preprocessing. + A simple deviation is that the tensor in ehrdata is of shape n_obs x n_vars x n_intervals (with defaults, 3000x37x48) while the tensor in PyPOTS is of shape n_obs x n_intervals x n_vars (3000x48x37). + The tensor stored in .r is hence also fully compatible with the PyPOTS package, as the .r tensor of EHRData objects generally is. + + data_path + Path to the raw data. If the path exists, the data is loaded from there. Else, the data is downloaded. + interval_length_number + Numeric value of the length of one interval. + interval_length_unit + Unit belonging to the interval length. + num_intervals + Number of intervals. + aggregation_strategy + Aggregation strategy for the time series data. + drop_samples + Samples to drop from the dataset (indicate their RecordID). + + Returns + ------- + Returns a the processed physionet2012 dataset in an EHRData object. The raw data is also downloaded, stored and available under the data_path. + + Examples + -------- + >>> import ehrapy as ep + >>> import ehrdata as ed + >>> edata = ed.dt.physionet_2012() + >>> edata + """ + if data_path is None: + data_path = Path("ehrapy_data/physionet2012") + + pass + # download data + # load data + # put a/b/c in obs + # put outcomes in obs + # put record id in obs + # put units to var + # put featurenames to var + # put time to t + + +def physionet2019(): + """Loads the dataset of the `PhysioNet challenge 2019 _`.""" raise NotImplementedError() -# TODO: physionet2012, physionet2019 +def mimic_ii(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: + """Loads the MIMIC2 dataset.""" + # TODO: replace mimic_ii as is in ehrapy with its dict-of-table return time - map variables to OMOP? 
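The axis-order remark in the physionet2012 docstring above amounts to a single transpose when moving between the two layouts:

```python
import numpy as np

r = np.zeros((3000, 37, 48))  # EHRData layout: n_obs x n_vars x n_intervals
pypots_layout = r.transpose(0, 2, 1)  # PyPOTS layout: n_obs x n_intervals x n_vars
assert pypots_layout.shape == (3000, 48, 37)
```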
+ raise NotImplementedError() diff --git a/tests/test_dt/test_dt.py b/tests/test_dt/test_dt.py index b78164d..122484c 100644 --- a/tests/test_dt/test_dt.py +++ b/tests/test_dt/test_dt.py @@ -35,3 +35,7 @@ def test_synthea27nj_omop(duckdb_connection): assert len(duckdb_connection.execute("SHOW TABLES").df()) == 37 # sanity check of one table assert duckdb_connection.execute("SELECT * FROM person").df().shape == (28, 18) + + +def test_physionet_2012(): + pass From 7dc19031f09f72981d69b0c1adc967ac7ab17b91 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Wed, 6 Nov 2024 13:24:48 +0100 Subject: [PATCH 08/43] load datasets with ehrapy's download function copied; cleaner table extraction --- src/ehrdata/dt/dataloader.py | 112 +++++++++++++++++++++++++++++++++++ src/ehrdata/dt/datasets.py | 94 +++++++++++++++-------------- 2 files changed, 160 insertions(+), 46 deletions(-) create mode 100644 src/ehrdata/dt/dataloader.py diff --git a/src/ehrdata/dt/dataloader.py b/src/ehrdata/dt/dataloader.py new file mode 100644 index 0000000..46db42b --- /dev/null +++ b/src/ehrdata/dt/dataloader.py @@ -0,0 +1,112 @@ +from __future__ import annotations + +import os +import shutil +import tempfile +from pathlib import Path +from random import choice +from string import ascii_lowercase +from typing import Literal + +import requests +from filelock import FileLock +from rich import print +from rich.progress import Progress + + +def download( + url: str, + archive_format: Literal["zip", "tar", "tar.gz", "tgz"] = None, + output_file_name: str = None, + output_path: str | Path = None, + block_size: int = 1024, + overwrite: bool = False, +) -> None: # pragma: no cover + """Downloads a file irrespective of format. + + Args: + url: URL to download. + archive_format: The format if an archive file. + output_file_name: Name of the downloaded file. + output_path: Path to download/extract the files to. Defaults to 'OS tmpdir' if not specified. + block_size: Block size for downloads in bytes. + overwrite: Whether to overwrite existing files. + """ + if output_file_name is None: + letters = ascii_lowercase + output_file_name = f"ehrapy_tmp_{''.join(choice(letters) for _ in range(10))}" + + if output_path is None: + output_path = tempfile.gettempdir() + + def _sanitize_file_name(file_name): + if os.name == "nt": + file_name = file_name.replace("?", "_").replace("*", "_") + return file_name + + download_to_path = Path( + _sanitize_file_name( + f"{output_path}{output_file_name}" + if str(output_path).endswith("/") + else f"{output_path}/{output_file_name}" + ) + ) + + Path(output_path).mkdir(parents=True, exist_ok=True) + lock_path = f"{download_to_path}.lock" + with FileLock(lock_path): + if download_to_path.exists(): + warning = f"[bold red]File {download_to_path} already exists!" 
+ if not overwrite: + print(warning) + return + else: + print(f"{warning} Overwriting...") + + response = requests.get(url, stream=True) + total = int(response.headers.get("content-length", 0)) + + temp_file_name = f"{download_to_path}.part" + + with Progress(refresh_per_second=1500) as progress: + task = progress.add_task("[red]Downloading...", total=total) + with Path(temp_file_name).open("wb") as file: + for data in response.iter_content(block_size): + file.write(data) + progress.update(task, advance=block_size) + + # force the progress bar to 100% at the end + progress.update(task, completed=total, refresh=True) + + Path(temp_file_name).replace(download_to_path) + + if archive_format: + output_path = output_path or tempfile.gettempdir() + shutil.unpack_archive(download_to_path, output_path, format=archive_format) + download_to_path.unlink() + list_of_paths = [path for path in Path(output_path).resolve().glob("*/") if not path.name.startswith(".")] + latest_path = max(list_of_paths, key=lambda path: path.stat().st_ctime) + shutil.move(latest_path, latest_path.parent / remove_archive_extension(output_file_name)) # type: ignore + + Path(lock_path).unlink(missing_ok=True) + + +def remove_archive_extension(file_path): + """Remove the archive extension from the file path.""" + return ( + str(Path(file_path).with_suffix("")) + if any( + Path(file_path).suffix.endswith(ext) + for ext in [ + ".zip", + ".tar", + ".tar.gz", + ".tgz", + ".tar.bz2", + ".tbz2", + ".tar.xz", + ".txz", + ] + ) + else file_path + ) diff --git a/src/ehrdata/dt/datasets.py b/src/ehrdata/dt/datasets.py index dd7b5a3..96d2cb3 100644 --- a/src/ehrdata/dt/datasets.py +++ b/src/ehrdata/dt/datasets.py @@ -1,19 +1,21 @@ from __future__ import annotations -import io import os -import zipfile +import shutil from collections.abc import Sequence from pathlib import Path from typing import TYPE_CHECKING -import requests from duckdb.duckdb import DuckDBPyConnection +from ehrdata.dt.dataloader import download + if TYPE_CHECKING: from ehrdata import EHRData from ehrdata.utils._omop_utils import get_table_catalog_dict +DOWNLOAD_VERIFICATION_TAG = "download_verification_tag" + def _get_table_list() -> list: flat_table_list = [] @@ -45,7 +47,7 @@ def _set_up_duckdb(path: Path, backend_handle: DuckDBPyConnection, prefix: str = file_name_trunk.replace(prefix, ""), backend_handle.read_csv(f"{path}/{file_name}", dtype=dtype), ) - else: + elif file_name_trunk != DOWNLOAD_VERIFICATION_TAG: unused_files.append(file_name) for table in tables: @@ -59,30 +61,26 @@ def _set_up_duckdb(path: Path, backend_handle: DuckDBPyConnection, prefix: str = def _setup_eunomia_datasets( backend_handle: DuckDBPyConnection, data_path: Path | None = None, - URL: str = None, - dataset_postfix: str = "", + data_url: str = None, + nested_omop_table_path: str = "", dataset_prefix: str = "", ) -> None: """Loads the Eunomia datasets in the OMOP Common Data model.""" - if os.path.exists(data_path): - print(f"Path to data exists, load tables from there: {data_path}") - else: - print("Downloading data...") - response = requests.get(URL) - - if response.status_code == 200: - # Use zipfile and io to open the ZIP file in memory - with zipfile.ZipFile(io.BytesIO(response.content)) as z: - # Extract all contents of the ZIP file - z.extractall(data_path) # Specify the folder where files will be extracted - print( - f"Download successful. ZIP file downloaded and extracted successfully to {data_path/dataset_postfix}." - ) - else: - print(f"Failed to download the file. 
Status code: {response.status_code}") - return - - return _set_up_duckdb(data_path / dataset_postfix, backend_handle, prefix=dataset_prefix) + download( + data_url, + archive_format="zip", + output_file_name=DOWNLOAD_VERIFICATION_TAG, + output_path=data_path, + ) + + for file_path in (data_path / DOWNLOAD_VERIFICATION_TAG / nested_omop_table_path).glob("*.csv"): + shutil.move(file_path, data_path) + + _set_up_duckdb( + data_path, + backend_handle, + prefix=dataset_prefix, + ) def mimic_iv_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: @@ -102,7 +100,7 @@ def mimic_iv_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = N Returns ------- - Returns nothing, but adds the tables to the backend via the handle. + Returns nothing, adds the tables to the backend via the handle. Examples -------- @@ -113,20 +111,21 @@ def mimic_iv_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = N >>> ed.dt.mimic_iv_omop(backend_handle=con) >>> con.execute("SHOW TABLES;").fetchall() """ + data_url = "https://physionet.org/static/published-projects/mimic-iv-demo-omop/mimic-iv-demo-data-in-the-omop-common-data-model-0.9.zip" if data_path is None: data_path = Path("ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9") - return _setup_eunomia_datasets( - backend_handle, - data_path, - URL="https://physionet.org/static/published-projects/mimic-iv-demo-omop/mimic-iv-demo-data-in-the-omop-common-data-model-0.9.zip", - dataset_postfix="mimic-iv-demo-data-in-the-omop-common-data-model-0.9/1_omop_data_csv", + _setup_eunomia_datasets( + backend_handle=backend_handle, + data_path=data_path, + data_url=data_url, + nested_omop_table_path="1_omop_data_csv", dataset_prefix="2b_", ) def gibleed_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: - """Loads the GIBleed dataset in the OMOP Common Data model. + """Loads the GiBleed dataset in the OMOP Common Data model. This function loads the GIBleed dataset from the `EunomiaDatasets repository _`. More details: https://github.com/OHDSI/EunomiaDatasets/tree/main/datasets/GiBleed. @@ -140,7 +139,7 @@ def gibleed_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = No Returns ------- - Returns nothing, but adds the tables to the backend via the handle. + Returns nothing, adds the tables to the backend via the handle. Examples -------- @@ -151,21 +150,22 @@ def gibleed_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = No >>> ed.dt.gibleed_omop(backend_handle=con) >>> con.execute("SHOW TABLES;").fetchall() """ + data_url = "https://github.com/OHDSI/EunomiaDatasets/raw/main/datasets/GiBleed/GiBleed_5.3.zip" + if data_path is None: - data_path = Path("ehrapy_data/GIBleed_dataset") + data_path = Path("ehrapy_data/GiBleed") - return _setup_eunomia_datasets( - backend_handle, - data_path, - URL="https://github.com/OHDSI/EunomiaDatasets/raw/main/datasets/GiBleed/GiBleed_5.3.zip", - dataset_postfix="GiBleed_5.3", + _setup_eunomia_datasets( + backend_handle=backend_handle, + data_path=data_path, + data_url=data_url, ) def synthea27nj_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: - """Loads the Synthea27NJ dataset in the OMOP Common Data model. + """Loads the Synthea27Nj dataset in the OMOP Common Data model. - This function loads the Synthea27NJ dataset from the `EunomiaDatasets repository _`. + This function loads the Synthea27Nj dataset from the `EunomiaDatasets repository _`. 
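The `nested_omop_table_path` handling above simply flattens the dataset-specific subfolder into `data_path` before registration; in isolation, the step for the MIMIC-IV demo looks like this (folder names taken from the call above):

```python
import shutil
from pathlib import Path

data_path = Path("ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9")
nested = data_path / "download_verification_tag" / "1_omop_data_csv"
for file_path in nested.glob("*.csv"):
    shutil.move(file_path, data_path)
```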
More details: https://github.com/OHDSI/EunomiaDatasets/tree/main/datasets/Synthea27Nj. Parameters @@ -177,7 +177,7 @@ def synthea27nj_omop(backend_handle: DuckDBPyConnection, data_path: Path | None Returns ------- - Returns nothing, but adds the tables to the backend via the handle. + Returns nothing, adds the tables to the backend via the handle. Examples -------- @@ -188,13 +188,15 @@ def synthea27nj_omop(backend_handle: DuckDBPyConnection, data_path: Path | None >>> ed.dt.synthea27nj_omop(backend_handle=con) >>> con.execute("SHOW TABLES;").fetchall() """ + data_url = "https://github.com/OHDSI/EunomiaDatasets/raw/main/datasets/Synthea27Nj/Synthea27Nj_5.4.zip" + if data_path is None: data_path = Path("ehrapy_data/Synthea27Nj") - return _setup_eunomia_datasets( - backend_handle, - data_path, - URL="https://github.com/OHDSI/EunomiaDatasets/raw/main/datasets/Synthea27Nj/Synthea27Nj_5.4.zip", + _setup_eunomia_datasets( + backend_handle=backend_handle, + data_path=data_path, + data_url=data_url, ) From 3f241fad6ad83abac9f84bc0f8cd960727432c4a Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Mon, 11 Nov 2024 13:12:02 +0100 Subject: [PATCH 09/43] remove dataloader --- src/ehrdata/dt/dataloader.py | 112 ----------------------------------- 1 file changed, 112 deletions(-) delete mode 100644 src/ehrdata/dt/dataloader.py diff --git a/src/ehrdata/dt/dataloader.py b/src/ehrdata/dt/dataloader.py deleted file mode 100644 index 46db42b..0000000 --- a/src/ehrdata/dt/dataloader.py +++ /dev/null @@ -1,112 +0,0 @@ -from __future__ import annotations - -import os -import shutil -import tempfile -from pathlib import Path -from random import choice -from string import ascii_lowercase -from typing import Literal - -import requests -from filelock import FileLock -from rich import print -from rich.progress import Progress - - -def download( - url: str, - archive_format: Literal["zip", "tar", "tar.gz", "tgz"] = None, - output_file_name: str = None, - output_path: str | Path = None, - block_size: int = 1024, - overwrite: bool = False, -) -> None: # pragma: no cover - """Downloads a file irrespective of format. - - Args: - url: URL to download. - archive_format: The format if an archive file. - output_file_name: Name of the downloaded file. - output_path: Path to download/extract the files to. Defaults to 'OS tmpdir' if not specified. - block_size: Block size for downloads in bytes. - overwrite: Whether to overwrite existing files. - """ - if output_file_name is None: - letters = ascii_lowercase - output_file_name = f"ehrapy_tmp_{''.join(choice(letters) for _ in range(10))}" - - if output_path is None: - output_path = tempfile.gettempdir() - - def _sanitize_file_name(file_name): - if os.name == "nt": - file_name = file_name.replace("?", "_").replace("*", "_") - return file_name - - download_to_path = Path( - _sanitize_file_name( - f"{output_path}{output_file_name}" - if str(output_path).endswith("/") - else f"{output_path}/{output_file_name}" - ) - ) - - Path(output_path).mkdir(parents=True, exist_ok=True) - lock_path = f"{download_to_path}.lock" - with FileLock(lock_path): - if download_to_path.exists(): - warning = f"[bold red]File {download_to_path} already exists!" 
- if not overwrite: - print(warning) - return - else: - print(f"{warning} Overwriting...") - - response = requests.get(url, stream=True) - total = int(response.headers.get("content-length", 0)) - - temp_file_name = f"{download_to_path}.part" - - with Progress(refresh_per_second=1500) as progress: - task = progress.add_task("[red]Downloading...", total=total) - with Path(temp_file_name).open("wb") as file: - for data in response.iter_content(block_size): - file.write(data) - progress.update(task, advance=block_size) - - # force the progress bar to 100% at the end - progress.update(task, completed=total, refresh=True) - - Path(temp_file_name).replace(download_to_path) - - if archive_format: - output_path = output_path or tempfile.gettempdir() - shutil.unpack_archive(download_to_path, output_path, format=archive_format) - download_to_path.unlink() - list_of_paths = [path for path in Path(output_path).resolve().glob("*/") if not path.name.startswith(".")] - latest_path = max(list_of_paths, key=lambda path: path.stat().st_ctime) - shutil.move(latest_path, latest_path.parent / remove_archive_extension(output_file_name)) # type: ignore - - Path(lock_path).unlink(missing_ok=True) - - -def remove_archive_extension(file_path): - """Remove the archive extension from the file path.""" - return ( - str(Path(file_path).with_suffix("")) - if any( - Path(file_path).suffix.endswith(ext) - for ext in [ - ".zip", - ".tar", - ".tar.gz", - ".tgz", - ".tar.bz2", - ".tbz2", - ".tar.xz", - ".txz", - ] - ) - else file_path - ) From 1f329ec8a172fa8074f2ed8fc0cc43761cef8e64 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Mon, 11 Nov 2024 13:19:31 +0100 Subject: [PATCH 10/43] remove physio2012 stubs --- src/ehrdata/dt/datasets.py | 73 +------------------------------------- tests/test_dt/test_dt.py | 4 --- 2 files changed, 1 insertion(+), 76 deletions(-) diff --git a/src/ehrdata/dt/datasets.py b/src/ehrdata/dt/datasets.py index 96d2cb3..b762f37 100644 --- a/src/ehrdata/dt/datasets.py +++ b/src/ehrdata/dt/datasets.py @@ -2,7 +2,6 @@ import os import shutil -from collections.abc import Sequence from pathlib import Path from typing import TYPE_CHECKING @@ -11,7 +10,7 @@ from ehrdata.dt.dataloader import download if TYPE_CHECKING: - from ehrdata import EHRData + pass from ehrdata.utils._omop_utils import get_table_catalog_dict DOWNLOAD_VERIFICATION_TAG = "download_verification_tag" @@ -200,76 +199,6 @@ def synthea27nj_omop(backend_handle: DuckDBPyConnection, data_path: Path | None ) -def physionet2012( - data_path: Path | None = None, - interval_length_number: int = 1, - interval_length_unit: str = "day", - num_intervals: int = 48, - aggregation_strategy: str = "last", - drop_samples: Sequence[str] = [ - 147514, - 142731, - 145611, - 140501, - 155655, - 143656, - 156254, - 150309, - 140936, - 141264, - 150649, - 142998, - ], -) -> EHRData: - """Loads the dataset of the `PhysioNet challenge 2012 (v1.0.0) _`. - - If interval_length_number is 1, interval_length_unit is "day", and num_intervals is 48, this is equivalent to the SAITS preprocessing (insert paper/link/citation). - Truncated if a sample has more num_intervals steps; Padded if a sample has less than num_intervals steps. - Further, by default the following 12 samples are dropped since they have no time series information at all: 147514, 142731, 145611, 140501, 155655, 143656, 156254, 150309, - 140936, 141264, 150649, 142998. 
- - Taken the defaults of interval_length_number, interval_length_unit, num_intervals, and drop_samples, the tensor stored in .r of edata is the same as when doing the PyPOTS preprocessing. - A simple deviation is that the tensor in ehrdata is of shape n_obs x n_vars x n_intervals (with defaults, 3000x37x48) while the tensor in PyPOTS is of shape n_obs x n_intervals x n_vars (3000x48x37). - The tensor stored in .r is hence also fully compatible with the PyPOTS package, as the .r tensor of EHRData objects generally is. - - data_path - Path to the raw data. If the path exists, the data is loaded from there. Else, the data is downloaded. - interval_length_number - Numeric value of the length of one interval. - interval_length_unit - Unit belonging to the interval length. - num_intervals - Number of intervals. - aggregation_strategy - Aggregation strategy for the time series data. - drop_samples - Samples to drop from the dataset (indicate their RecordID). - - Returns - ------- - Returns a the processed physionet2012 dataset in an EHRData object. The raw data is also downloaded, stored and available under the data_path. - - Examples - -------- - >>> import ehrapy as ep - >>> import ehrdata as ed - >>> edata = ed.dt.physionet_2012() - >>> edata - """ - if data_path is None: - data_path = Path("ehrapy_data/physionet2012") - - pass - # download data - # load data - # put a/b/c in obs - # put outcomes in obs - # put record id in obs - # put units to var - # put featurenames to var - # put time to t - - def physionet2019(): """Loads the dataset of the `PhysioNet challenge 2019 _`.""" raise NotImplementedError() diff --git a/tests/test_dt/test_dt.py b/tests/test_dt/test_dt.py index 122484c..b78164d 100644 --- a/tests/test_dt/test_dt.py +++ b/tests/test_dt/test_dt.py @@ -35,7 +35,3 @@ def test_synthea27nj_omop(duckdb_connection): assert len(duckdb_connection.execute("SHOW TABLES").df()) == 37 # sanity check of one table assert duckdb_connection.execute("SELECT * FROM person").df().shape == (28, 18) - - -def test_physionet_2012(): - pass From 3947f3acafd5de2548a26e1464b8ef16fd4df9ca Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Mon, 11 Nov 2024 19:48:50 +0100 Subject: [PATCH 11/43] switch to logging instead of prints --- src/ehrdata/__init__.py | 4 ++++ src/ehrdata/dt/dataloader.py | 10 ++++++---- src/ehrdata/dt/datasets.py | 5 +++-- src/ehrdata/logging_config.py | 10 ++++++++++ 4 files changed, 23 insertions(+), 6 deletions(-) create mode 100644 src/ehrdata/logging_config.py diff --git a/src/ehrdata/__init__.py b/src/ehrdata/__init__.py index 62c5c45..d657790 100644 --- a/src/ehrdata/__init__.py +++ b/src/ehrdata/__init__.py @@ -6,3 +6,7 @@ __all__ = ["EHRData", "dt", "io", "pl", "pp", "tl"] __version__ = version("ehrdata") + +from .logging_config import configure_logging + +configure_logging() diff --git a/src/ehrdata/dt/dataloader.py b/src/ehrdata/dt/dataloader.py index bf51f50..a31e568 100644 --- a/src/ehrdata/dt/dataloader.py +++ b/src/ehrdata/dt/dataloader.py @@ -1,5 +1,6 @@ from __future__ import annotations +import logging import os import shutil import tempfile @@ -10,7 +11,8 @@ import requests from filelock import FileLock -from rich import print + +# from rich import print from rich.progress import Progress @@ -56,12 +58,12 @@ def _sanitize_file_name(file_name): lock_path = f"{download_to_path}.lock" with FileLock(lock_path): if _remove_archive_extension(download_to_path).exists(): - warning = f"[bold red]File {_remove_archive_extension(download_to_path)} already exists!" 
+ warning = f"File {_remove_archive_extension(download_to_path)} already exists!" if not overwrite: - print(warning) + logging.info(warning) return else: - print(f"{warning} Overwriting...") + logging.info(f"{warning} Overwriting...") response = requests.get(url, stream=True) total = int(response.headers.get("content-length", 0)) diff --git a/src/ehrdata/dt/datasets.py b/src/ehrdata/dt/datasets.py index ce1e965..43091ef 100644 --- a/src/ehrdata/dt/datasets.py +++ b/src/ehrdata/dt/datasets.py @@ -1,5 +1,6 @@ from __future__ import annotations +import logging import os import shutil from collections.abc import Sequence @@ -57,8 +58,8 @@ def _set_up_duckdb(path: Path, backend_handle: DuckDBPyConnection, prefix: str = if table not in used_tables: missing_tables.append(table) - print("missing tables: ", missing_tables) - print("unused files: ", unused_files) + logging.info(f"missing tables: {missing_tables}") + logging.info(f"unused files: {unused_files}") def _setup_eunomia_datasets( diff --git a/src/ehrdata/logging_config.py b/src/ehrdata/logging_config.py new file mode 100644 index 0000000..66ade38 --- /dev/null +++ b/src/ehrdata/logging_config.py @@ -0,0 +1,10 @@ +import logging + + +def configure_logging(level=logging.INFO): + """Configures logging for the package.""" + logging.basicConfig( + level=level, + format="%(levelname)s - %(message)s", + force=True, + ) From ff0dd740f7e6e71d59dd3370498c29ab46a7effd Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Mon, 11 Nov 2024 20:18:57 +0100 Subject: [PATCH 12/43] check individual connections to resolve synthea27nj --- tests/test_dt/test_dt.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/test_dt/test_dt.py b/tests/test_dt/test_dt.py index 8f1ca6d..217a854 100644 --- a/tests/test_dt/test_dt.py +++ b/tests/test_dt/test_dt.py @@ -17,25 +17,31 @@ def duckdb_connection(): con.close() -def test_mimic_iv_omop(duckdb_connection): +def test_mimic_iv_omop(): + duckdb_connection = duckdb.connect() ed.dt.mimic_iv_omop(backend_handle=duckdb_connection) assert len(duckdb_connection.execute("SHOW TABLES").df()) == 30 # sanity check of one table assert duckdb_connection.execute("SELECT * FROM person").df().shape == (100, 18) + duckdb_connection.close() -def test_gibleed_omop(duckdb_connection): +def test_gibleed_omop(): + duckdb_connection = duckdb.connect() ed.dt.gibleed_omop(backend_handle=duckdb_connection) assert len(duckdb_connection.execute("SHOW TABLES").df()) == 36 # sanity check of one table assert duckdb_connection.execute("SELECT * FROM person").df().shape == (2694, 18) + duckdb_connection.close() -def test_synthea27nj_omop(duckdb_connection): +def test_synthea27nj_omop(): + duckdb_connection = duckdb.connect() ed.dt.synthea27nj_omop(backend_handle=duckdb_connection) assert len(duckdb_connection.execute("SHOW TABLES").df()) == 37 # sanity check of one table assert duckdb_connection.execute("SELECT * FROM person").df().shape == (28, 18) + duckdb_connection.close() def test_physionet2012(): From 66f232f3634aec794fc5ae19b19031b063048cf6 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Mon, 11 Nov 2024 20:35:02 +0100 Subject: [PATCH 13/43] try to empty cache --- .github/workflows/test.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index f069c66..68784ef 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -49,6 +49,8 @@ jobs: cache-dependency-path: pyproject.toml - name: Install dependencies run: uv pip install 
--system ${{ matrix.pip-flags }} ".[dev,test]" + - name: Delete pytest cache + run: rm -rf .pytest_cache - name: Test env: MPLBACKEND: agg From 6a2e3ed06405069e7ae6752e537c400b6cec766b Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Mon, 11 Nov 2024 20:39:48 +0100 Subject: [PATCH 14/43] new test dir, undo workflows.yml --- .github/workflows/test.yaml | 2 -- tests/test_dt/test_dt.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 68784ef..f069c66 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -49,8 +49,6 @@ jobs: cache-dependency-path: pyproject.toml - name: Install dependencies run: uv pip install --system ${{ matrix.pip-flags }} ".[dev,test]" - - name: Delete pytest cache - run: rm -rf .pytest_cache - name: Test env: MPLBACKEND: agg diff --git a/tests/test_dt/test_dt.py b/tests/test_dt/test_dt.py index 217a854..e951f37 100644 --- a/tests/test_dt/test_dt.py +++ b/tests/test_dt/test_dt.py @@ -6,7 +6,7 @@ import ehrdata as ed -TEST_DATA_DIR = Path(__file__).parent / "ehrapy_data" +TEST_DATA_DIR = Path(__file__).parent / "ehrapy_data2" @pytest.fixture(scope="function") From e1a6abea4607daf353f324a8df5fafea63b297b8 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Mon, 11 Nov 2024 20:45:10 +0100 Subject: [PATCH 15/43] try different dir --- tests/test_dt/test_dt.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_dt/test_dt.py b/tests/test_dt/test_dt.py index e951f37..ff973be 100644 --- a/tests/test_dt/test_dt.py +++ b/tests/test_dt/test_dt.py @@ -19,7 +19,7 @@ def duckdb_connection(): def test_mimic_iv_omop(): duckdb_connection = duckdb.connect() - ed.dt.mimic_iv_omop(backend_handle=duckdb_connection) + ed.dt.mimic_iv_omop(data_path=TEST_DATA_DIR, backend_handle=duckdb_connection) assert len(duckdb_connection.execute("SHOW TABLES").df()) == 30 # sanity check of one table assert duckdb_connection.execute("SELECT * FROM person").df().shape == (100, 18) @@ -28,7 +28,7 @@ def test_mimic_iv_omop(): def test_gibleed_omop(): duckdb_connection = duckdb.connect() - ed.dt.gibleed_omop(backend_handle=duckdb_connection) + ed.dt.gibleed_omop(data_path=TEST_DATA_DIR, backend_handle=duckdb_connection) assert len(duckdb_connection.execute("SHOW TABLES").df()) == 36 # sanity check of one table assert duckdb_connection.execute("SELECT * FROM person").df().shape == (2694, 18) @@ -37,7 +37,7 @@ def test_gibleed_omop(): def test_synthea27nj_omop(): duckdb_connection = duckdb.connect() - ed.dt.synthea27nj_omop(backend_handle=duckdb_connection) + ed.dt.synthea27nj_omop(data_path=TEST_DATA_DIR, backend_handle=duckdb_connection) assert len(duckdb_connection.execute("SHOW TABLES").df()) == 37 # sanity check of one table assert duckdb_connection.execute("SELECT * FROM person").df().shape == (28, 18) From df3827ad0fd70b93a36cc89385d6a50f6d6838c0 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Thu, 14 Nov 2024 20:03:20 +0100 Subject: [PATCH 16/43] improve download and datasets --- src/ehrdata/dt/dataloader.py | 117 ++++++++++++++--------------------- src/ehrdata/dt/datasets.py | 33 +++++----- tests/test_dt/test_dt.py | 16 ++--- 3 files changed, 68 insertions(+), 98 deletions(-) diff --git a/src/ehrdata/dt/dataloader.py b/src/ehrdata/dt/dataloader.py index a31e568..201c956 100644 --- a/src/ehrdata/dt/dataloader.py +++ b/src/ehrdata/dt/dataloader.py @@ -5,9 +5,6 @@ import shutil import tempfile from pathlib import Path -from random import choice -from string import 
ascii_lowercase
-from typing import Literal

 import requests
 from filelock import FileLock
@@ -18,9 +15,7 @@

 def download(
     url: str,
-    archive_format: Literal["zip", "tar", "tar.gz", "tgz"] = None,
-    output_file_name: str = None,
-    output_path: str | Path = None,
+    saving_path: Path | str,
     block_size: int = 1024,
     overwrite: bool = False,
 ) -> None:  # pragma: no cover
@@ -28,47 +23,52 @@

     Args:
         url: URL to download.
-        archive_format: The format if an archive file.
-        output_file_name: Name of the downloaded file.
-        output_path: Path to download/extract the files to. Defaults to 'OS tmpdir' if not specified.
+        saving_path: Where the data should be downloaded to.
         block_size: Block size for downloads in bytes.
         overwrite: Whether to overwrite existing files.
     """
-    if output_file_name is None:
-        letters = ascii_lowercase
-        output_file_name = f"ehrapy_tmp_{''.join(choice(letters) for _ in range(10))}"
-
-    if output_path is None:
-        output_path = tempfile.gettempdir()
-
-    def _sanitize_file_name(file_name):
-        if os.name == "nt":
-            file_name = file_name.replace("?", "_").replace("*", "_")
-        return file_name
-
-    download_to_path = Path(
-        _sanitize_file_name(
-            f"{output_path}{output_file_name}"
-            if str(output_path).endswith("/")
-            else f"{output_path}/{output_file_name}"
-        )
-    )
-
-    Path(output_path).mkdir(parents=True, exist_ok=True)
-    lock_path = f"{download_to_path}.lock"
+    # note: tar.gz has to be before gz for the _remove_archive_extension function to remove the entire extension
+    compression_formats = ["tar.gz", "zip", "tar", "gz", "bz", "xz"]
+    raw_formats = ["csv", "txt", "parquet"]
+
+    saving_path = Path(saving_path)
+    # urls can end with "?download"
+    file_name = os.path.basename(url).split("?")[0]
+    suffix = file_name.split(".")[-1]
+
+    def _remove_archive_extension(file_path: str) -> str:
+        for ext in compression_formats:
+            # if the file path ends with extension, remove the extension and the dot before it (hence the -1)
+            if file_path.endswith(ext):
+                return file_path[: -len(ext) - 1]
+        return file_path
+
+    if suffix in raw_formats:
+        raw_data_saving_path = saving_path / file_name
+        path_to_check = raw_data_saving_path
+    elif suffix in compression_formats:
+        tmpdir = tempfile.mkdtemp()
+        raw_data_saving_path = Path(tmpdir) / file_name
+        path_to_check = saving_path / _remove_archive_extension(file_name)
+    else:
+        raise RuntimeError(f"Unknown file format: {suffix}")
+
+    if path_to_check.exists():
+        info = f"File {path_to_check} already exists!"
+        if not overwrite:
+            logging.info(f"{info} Use downloaded dataset...")
+            return
+        else:
+            logging.info(f"{info} Overwriting...")
+
+    logging.info(f"Downloading {file_name} from {url} to {raw_data_saving_path}")
+
+    lock_path = f"{raw_data_saving_path}.lock"
     with FileLock(lock_path):
-        if _remove_archive_extension(download_to_path).exists():
-            warning = f"File {_remove_archive_extension(download_to_path)} already exists!"
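
[editor's note] For orientation, a minimal sketch of how the reworked download() above is expected to be called; the URL and target folder are placeholders, not part of the patch:

    from ehrdata.dt.dataloader import download

    # an archive suffix (e.g. .zip) is fetched to a temp dir and unpacked into saving_path;
    # raw suffixes (csv/txt/parquet) are written to saving_path directly
    download(
        "https://example.org/GiBleed_5.3.zip",
        saving_path="ehrapy_data/GiBleed_5.3",
    )
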
- if not overwrite: - logging.info(warning) - return - else: - logging.info(f"{warning} Overwriting...") - response = requests.get(url, stream=True) total = int(response.headers.get("content-length", 0)) - temp_file_name = f"{download_to_path}.part" + temp_file_name = f"{raw_data_saving_path}.part" with Progress(refresh_per_second=1500) as progress: task = progress.add_task("[red]Downloading...", total=total) @@ -80,34 +79,12 @@ def _sanitize_file_name(file_name): # force the progress bar to 100% at the end progress.update(task, completed=total, refresh=True) - Path(temp_file_name).replace(download_to_path) + Path(temp_file_name).replace(raw_data_saving_path) - if archive_format: - output_path = output_path or tempfile.gettempdir() - shutil.unpack_archive(download_to_path, output_path, format=archive_format) - download_to_path.unlink() - list_of_paths = [path for path in Path(output_path).resolve().glob("*/") if not path.name.startswith(".")] - latest_path = max(list_of_paths, key=lambda path: path.stat().st_ctime) - shutil.move( - latest_path, - latest_path.parent / _remove_archive_extension(output_file_name), - ) # type: ignore + if suffix in compression_formats: + shutil.unpack_archive(raw_data_saving_path, saving_path) + logging.info( + f"Extracted archive {file_name} from {raw_data_saving_path} to {saving_path / _remove_archive_extension(file_name)}" + ) Path(lock_path).unlink(missing_ok=True) - - -def _remove_archive_extension(file_path): - path = Path(file_path) - for ext in [ - ".tar.gz", - ".tgz", - ".tar.bz2", - ".tbz2", - ".tar.xz", - ".txz", - ".zip", - ".tar", - ]: - if str(path).endswith(ext): - return Path(str(path)[: -len(ext)]) - return Path(path) diff --git a/src/ehrdata/dt/datasets.py b/src/ehrdata/dt/datasets.py index 43091ef..38b222a 100644 --- a/src/ehrdata/dt/datasets.py +++ b/src/ehrdata/dt/datasets.py @@ -63,22 +63,21 @@ def _set_up_duckdb(path: Path, backend_handle: DuckDBPyConnection, prefix: str = def _setup_eunomia_datasets( + data_url: str, backend_handle: DuckDBPyConnection, data_path: Path | None = None, - data_url: str = None, - nested_omop_table_path: str = "", + nested_omop_tables_folder: str = None, dataset_prefix: str = "", ) -> None: """Loads the Eunomia datasets in the OMOP Common Data model.""" download( data_url, - archive_format="zip", - output_file_name=DOWNLOAD_VERIFICATION_TAG, - output_path=data_path, + saving_path=data_path, ) - for file_path in (data_path / DOWNLOAD_VERIFICATION_TAG / nested_omop_table_path).glob("*.csv"): - shutil.move(file_path, data_path) + if nested_omop_tables_folder: + for file_path in (data_path / nested_omop_tables_folder).glob("*.csv"): + shutil.move(file_path, data_path) _set_up_duckdb( data_path, @@ -120,10 +119,10 @@ def mimic_iv_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = N data_path = Path("ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9") _setup_eunomia_datasets( + data_url=data_url, backend_handle=backend_handle, data_path=data_path, - data_url=data_url, - nested_omop_table_path="1_omop_data_csv", + nested_omop_tables_folder="mimic-iv-demo-data-in-the-omop-common-data-model-0.9/1_omop_data_csv", dataset_prefix="2b_", ) @@ -157,12 +156,13 @@ def gibleed_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = No data_url = "https://github.com/OHDSI/EunomiaDatasets/raw/main/datasets/GiBleed/GiBleed_5.3.zip" if data_path is None: - data_path = Path("ehrapy_data/GiBleed") + data_path = Path("ehrapy_data/GiBleed_5.3") _setup_eunomia_datasets( + data_url=data_url, 
backend_handle=backend_handle, data_path=data_path, - data_url=data_url, + nested_omop_tables_folder="GiBleed_5.3", ) @@ -195,12 +195,12 @@ def synthea27nj_omop(backend_handle: DuckDBPyConnection, data_path: Path | None data_url = "https://github.com/OHDSI/EunomiaDatasets/raw/main/datasets/Synthea27Nj/Synthea27Nj_5.4.zip" if data_path is None: - data_path = Path("ehrapy_data/Synthea27Nj") + data_path = Path("ehrapy_data/Synthea27Nj_5.4") _setup_eunomia_datasets( + data_url=data_url, backend_handle=backend_handle, data_path=data_path, - data_url=data_url, ) @@ -289,16 +289,13 @@ def physionet2012( for file_name in temp_data_set_names: download( url=f"https://physionet.org/files/challenge-2012/1.0.0/{file_name}.tar.gz?download", - output_path=data_path, - output_file_name=file_name + ".tar.gz", - archive_format="gztar", + saving_path=data_path, ) for file_name in outcome_file_names: download( url=f"https://physionet.org/files/challenge-2012/1.0.0/{file_name}?download", - output_path=data_path, - output_file_name=file_name, + saving_path=data_path, ) static_features = ["Age", "Gender", "ICUType", "Height"] diff --git a/tests/test_dt/test_dt.py b/tests/test_dt/test_dt.py index ff973be..02fb030 100644 --- a/tests/test_dt/test_dt.py +++ b/tests/test_dt/test_dt.py @@ -1,13 +1,9 @@ -from pathlib import Path - import duckdb import numpy as np import pytest import ehrdata as ed -TEST_DATA_DIR = Path(__file__).parent / "ehrapy_data2" - @pytest.fixture(scope="function") def duckdb_connection(): @@ -17,27 +13,27 @@ def duckdb_connection(): con.close() -def test_mimic_iv_omop(): +def test_mimic_iv_omop(tmp_path): duckdb_connection = duckdb.connect() - ed.dt.mimic_iv_omop(data_path=TEST_DATA_DIR, backend_handle=duckdb_connection) + ed.dt.mimic_iv_omop(data_path=tmp_path, backend_handle=duckdb_connection) assert len(duckdb_connection.execute("SHOW TABLES").df()) == 30 # sanity check of one table assert duckdb_connection.execute("SELECT * FROM person").df().shape == (100, 18) duckdb_connection.close() -def test_gibleed_omop(): +def test_gibleed_omop(tmp_path): duckdb_connection = duckdb.connect() - ed.dt.gibleed_omop(data_path=TEST_DATA_DIR, backend_handle=duckdb_connection) + ed.dt.gibleed_omop(data_path=tmp_path, backend_handle=duckdb_connection) assert len(duckdb_connection.execute("SHOW TABLES").df()) == 36 # sanity check of one table assert duckdb_connection.execute("SELECT * FROM person").df().shape == (2694, 18) duckdb_connection.close() -def test_synthea27nj_omop(): +def test_synthea27nj_omop(tmp_path): duckdb_connection = duckdb.connect() - ed.dt.synthea27nj_omop(data_path=TEST_DATA_DIR, backend_handle=duckdb_connection) + ed.dt.synthea27nj_omop(data_path=tmp_path, backend_handle=duckdb_connection) assert len(duckdb_connection.execute("SHOW TABLES").df()) == 37 # sanity check of one table assert duckdb_connection.execute("SELECT * FROM person").df().shape == (28, 18) From 45f388ed8b47ea440b503348881b69eb168ea9fc Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Fri, 15 Nov 2024 11:47:34 +0100 Subject: [PATCH 17/43] fix pandas warning --- src/ehrdata/dt/datasets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ehrdata/dt/datasets.py b/src/ehrdata/dt/datasets.py index 38b222a..3a1eb54 100644 --- a/src/ehrdata/dt/datasets.py +++ b/src/ehrdata/dt/datasets.py @@ -49,7 +49,7 @@ def _set_up_duckdb(path: Path, backend_handle: DuckDBPyConnection, prefix: str = backend_handle.register( file_name_trunk.replace(prefix, ""), - backend_handle.read_csv(f"{path}/{file_name}", 
dtype=dtype), + backend_handle.read_csv(f"{path}/{file_name}", dtype=dtype, delim=","), ) elif file_name_trunk != DOWNLOAD_VERIFICATION_TAG: unused_files.append(file_name) @@ -352,7 +352,7 @@ def physionet2012( df_long_time_seconds = np.array(pd.to_timedelta(df_long["Time"] + ":00").dt.total_seconds()) interval_df_interval_end_offset_seconds = np.array(interval_df["interval_end_offset"].dt.total_seconds()) df_long_interval_step = np.argmax(df_long_time_seconds[:, None] <= interval_df_interval_end_offset_seconds, axis=1) - df_long["interval_step"] = df_long_interval_step + df_long.loc[:, ["interval_step"]] = df_long_interval_step # if one person for one feature (=Parameter) within one interval_step has multiple measurements, decide which one to keep df_long = df_long.drop_duplicates(subset=["RecordID", "Parameter", "interval_step"], keep=aggregation_strategy) From f4df4ffc2b2ff0e0a7f4dfe4db61f35bf877a18b Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Fri, 15 Nov 2024 11:48:57 +0100 Subject: [PATCH 18/43] use , delimiter in duckdb --- src/ehrdata/dt/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ehrdata/dt/datasets.py b/src/ehrdata/dt/datasets.py index 3a1eb54..c6dc72a 100644 --- a/src/ehrdata/dt/datasets.py +++ b/src/ehrdata/dt/datasets.py @@ -49,7 +49,7 @@ def _set_up_duckdb(path: Path, backend_handle: DuckDBPyConnection, prefix: str = backend_handle.register( file_name_trunk.replace(prefix, ""), - backend_handle.read_csv(f"{path}/{file_name}", dtype=dtype, delim=","), + backend_handle.read_csv(f"{path}/{file_name}", dtype=dtype, delimiter=","), ) elif file_name_trunk != DOWNLOAD_VERIFICATION_TAG: unused_files.append(file_name) From bbc663e31b995a90355c24e998ca961dd38470ab Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Fri, 15 Nov 2024 11:52:15 +0100 Subject: [PATCH 19/43] remove extract's from api.md --- docs/api.md | 10 ---------- src/ehrdata/io/omop/omop.py | 11 ----------- 2 files changed, 21 deletions(-) diff --git a/docs/api.md b/docs/api.md index 12fb317..2dc62aa 100644 --- a/docs/api.md +++ b/docs/api.md @@ -24,16 +24,6 @@ io.omop.setup_variables io.omop.get_time_interval_table io.omop.load - io.omop.extract_person - io.omop.extract_observation_period - io.omop.extract_measurement - io.omop.extract_observation - io.omop.extract_procedure_occurrence - io.omop.extract_specimen - io.omop.extract_device_exposure - io.omop.extract_drug_exposure - io.omop.extract_condition_occurrence - io.omop.extract_note ``` ## Datasets diff --git a/src/ehrdata/io/omop/omop.py b/src/ehrdata/io/omop/omop.py index 6034b17..f9b460f 100644 --- a/src/ehrdata/io/omop/omop.py +++ b/src/ehrdata/io/omop/omop.py @@ -460,17 +460,6 @@ def extract_procedure_occurrence(duckdb_instance): ) -def extract_specimen(duckdb_instance): - """Extract a table of an OMOP CDM Database.""" - return get_table( - duckdb_instance, - table_name="specimen", - concept_id_col="specimen_concept_id", - value_col="unit_concept_id", # Assuming `unit_concept_id` is a suitable value field - timestamp_col="specimen_datetime", - ) - - def extract_device_exposure(duckdb_instance): """Extract a table of an OMOP CDM Database.""" # return get_table( From 839ff091bda2251f66bfb4f93e437d591814cdd3 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Fri, 15 Nov 2024 12:22:22 +0100 Subject: [PATCH 20/43] read w/ pandas instead of duckdb --- src/ehrdata/dt/datasets.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/ehrdata/dt/datasets.py b/src/ehrdata/dt/datasets.py index 
c6dc72a..23dac96 100644 --- a/src/ehrdata/dt/datasets.py +++ b/src/ehrdata/dt/datasets.py @@ -47,10 +47,12 @@ def _set_up_duckdb(path: Path, backend_handle: DuckDBPyConnection, prefix: str = else: dtype = None - backend_handle.register( - file_name_trunk.replace(prefix, ""), - backend_handle.read_csv(f"{path}/{file_name}", dtype=dtype, delimiter=","), - ) + df = pd.read_csv(f"{path}/{file_name}", dtype=dtype) # noqa: F841 + backend_handle.execute(f"CREATE TABLE {file_name_trunk.replace(prefix, '')} AS SELECT * FROM df") + # backend_handle.register( + # file_name_trunk.replace(prefix, ""), + # backend_handle.read_csv(f"{path}/{file_name}", dtype=dtype, delimiter=","), + # ) elif file_name_trunk != DOWNLOAD_VERIFICATION_TAG: unused_files.append(file_name) From dafda91e1daee62015fa2fb7665697a33810d0a4 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Sun, 17 Nov 2024 18:59:28 +0100 Subject: [PATCH 21/43] support different capitalizations; check if pre-release fail again..? --- src/ehrdata/dt/datasets.py | 66 +++------ src/ehrdata/io/omop/__init__.py | 3 +- src/ehrdata/io/omop/omop.py | 129 ++++++++++++------ tests/conftest.py | 12 +- .../toy_omop/capital_letters/MEASUREMENT.csv | 2 + .../capital_letters/OBSERVATION_PERIOD.csv | 2 + .../data/toy_omop/capital_letters/PERSON.csv | 2 + tests/test_io/test_omop.py | 27 ++++ 8 files changed, 146 insertions(+), 97 deletions(-) create mode 100644 tests/data/toy_omop/capital_letters/MEASUREMENT.csv create mode 100644 tests/data/toy_omop/capital_letters/OBSERVATION_PERIOD.csv create mode 100644 tests/data/toy_omop/capital_letters/PERSON.csv diff --git a/src/ehrdata/dt/datasets.py b/src/ehrdata/dt/datasets.py index 23dac96..152a3eb 100644 --- a/src/ehrdata/dt/datasets.py +++ b/src/ehrdata/dt/datasets.py @@ -1,7 +1,5 @@ from __future__ import annotations -import logging -import os import shutil from collections.abc import Sequence from pathlib import Path @@ -12,56 +10,18 @@ from duckdb.duckdb import DuckDBPyConnection from ehrdata.dt.dataloader import download +from ehrdata.io.omop import setup_connection from ehrdata.io.omop._queries import _generate_timedeltas -from ehrdata.utils._omop_utils import get_table_catalog_dict if TYPE_CHECKING: from ehrdata import EHRData -DOWNLOAD_VERIFICATION_TAG = "download_verification_tag" - -def _get_table_list() -> list: - flat_table_list = [] - for _, value_list in get_table_catalog_dict().items(): - for value in value_list: - flat_table_list.append(value) - return flat_table_list - - -def _set_up_duckdb(path: Path, backend_handle: DuckDBPyConnection, prefix: str = "") -> None: - """Create tables in the backend from the CSV files in the path from datasets in the OMOP Common Data model.""" - tables = _get_table_list() - - used_tables = [] - missing_tables = [] - unused_files = [] - for file_name in os.listdir(path): - file_name_trunk = file_name.split(".")[0].lower() - - if file_name_trunk in tables or file_name_trunk.replace(prefix, "") in tables: - used_tables.append(file_name_trunk.replace(prefix, "")) - - if file_name_trunk == "measurement": - dtype = {"measurement_source_value": str} - else: - dtype = None - - df = pd.read_csv(f"{path}/{file_name}", dtype=dtype) # noqa: F841 - backend_handle.execute(f"CREATE TABLE {file_name_trunk.replace(prefix, '')} AS SELECT * FROM df") - # backend_handle.register( - # file_name_trunk.replace(prefix, ""), - # backend_handle.read_csv(f"{path}/{file_name}", dtype=dtype, delimiter=","), - # ) - elif file_name_trunk != DOWNLOAD_VERIFICATION_TAG: - unused_files.append(file_name) - 
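
[editor's note] Regarding the pd.read_csv + CREATE TABLE pattern introduced in the previous patch: it relies on DuckDB's replacement scans, which resolve a Python DataFrame variable by name inside SQL. A self-contained sketch, with a hypothetical file name:

    import duckdb
    import pandas as pd

    con = duckdb.connect()
    df = pd.read_csv("person.csv")  # hypothetical CSV
    # DuckDB looks up `df` in the surrounding Python scope
    con.execute("CREATE TABLE person AS SELECT * FROM df")
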
- for table in tables: - if table not in used_tables: - missing_tables.append(table) - - logging.info(f"missing tables: {missing_tables}") - logging.info(f"unused files: {unused_files}") +COLUMN_CASE = { + "uppercase": "uppercase", + "lowercase": "lowercase", + "titlecase": "titlecase", +} def _setup_eunomia_datasets( @@ -81,11 +41,12 @@ def _setup_eunomia_datasets( for file_path in (data_path / nested_omop_tables_folder).glob("*.csv"): shutil.move(file_path, data_path) - _set_up_duckdb( + edata = setup_connection( data_path, backend_handle, prefix=dataset_prefix, ) + return edata def mimic_iv_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: @@ -120,13 +81,14 @@ def mimic_iv_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = N if data_path is None: data_path = Path("ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9") - _setup_eunomia_datasets( + edata = _setup_eunomia_datasets( data_url=data_url, backend_handle=backend_handle, data_path=data_path, nested_omop_tables_folder="mimic-iv-demo-data-in-the-omop-common-data-model-0.9/1_omop_data_csv", dataset_prefix="2b_", ) + return edata def gibleed_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: @@ -160,13 +122,15 @@ def gibleed_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = No if data_path is None: data_path = Path("ehrapy_data/GiBleed_5.3") - _setup_eunomia_datasets( + edata = _setup_eunomia_datasets( data_url=data_url, backend_handle=backend_handle, data_path=data_path, nested_omop_tables_folder="GiBleed_5.3", ) + return edata + def synthea27nj_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: """Loads the Synthea27Nj dataset in the OMOP Common Data model. @@ -199,12 +163,14 @@ def synthea27nj_omop(backend_handle: DuckDBPyConnection, data_path: Path | None if data_path is None: data_path = Path("ehrapy_data/Synthea27Nj_5.4") - _setup_eunomia_datasets( + edata = _setup_eunomia_datasets( data_url=data_url, backend_handle=backend_handle, data_path=data_path, ) + return edata + def mimic_ii(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: """Loads the MIMIC2 dataset.""" diff --git a/src/ehrdata/io/omop/__init__.py b/src/ehrdata/io/omop/__init__.py index 8cd4668..6fb9860 100644 --- a/src/ehrdata/io/omop/__init__.py +++ b/src/ehrdata/io/omop/__init__.py @@ -1,7 +1,6 @@ from .omop import ( get_table, get_time_interval_table, - load, # extract_condition_occurrence, # extract_device_exposure, # extract_drug_exposure, @@ -13,7 +12,7 @@ # extract_person_observation_period, # extract_procedure_occurrence, # extract_specimen, - register_omop_to_db_connection, + setup_connection, setup_obs, setup_variables, ) diff --git a/src/ehrdata/io/omop/omop.py b/src/ehrdata/io/omop/omop.py index f9b460f..3d6efb6 100644 --- a/src/ehrdata/io/omop/omop.py +++ b/src/ehrdata/io/omop/omop.py @@ -1,5 +1,6 @@ from __future__ import annotations +import logging import os from collections.abc import Sequence from pathlib import Path @@ -9,24 +10,78 @@ import duckdb import numpy as np import pandas as pd +from duckdb.duckdb import DuckDBPyConnection from ehrdata.io.omop._queries import ( AGGREGATION_STRATEGY_KEY, time_interval_table_query_long_format, ) -from ehrdata.utils._omop_utils import get_omop_table_names +from ehrdata.utils._omop_utils import get_table_catalog_dict + +DOWNLOAD_VERIFICATION_TAG = "download_verification_tag" VALID_OBSERVATION_TABLES_SINGLE = ["person"] VALID_OBSERVATION_TABLES_JOIN = 
["person_cohort", "person_observation_period", "person_visit_occurrence"] VALID_VARIABLE_TABLES = ["measurement", "observation", "specimen"] -def _check_sanity_of_folder(folder_path: str | Path): - pass +def _get_table_list() -> list: + flat_table_list = [] + for _, value_list in get_table_catalog_dict().items(): + for value in value_list: + flat_table_list.append(value) + return flat_table_list -def _check_sanity_of_database(backend_handle: duckdb.DuckDB): - pass +def _set_up_duckdb(path: Path, backend_handle: DuckDBPyConnection, prefix: str = "") -> str: + """Create tables in the backend from the CSV files in the path from datasets in the OMOP Common Data model.""" + tables = _get_table_list() + + used_tables = [] + missing_tables = [] + unused_files = [] + for file_name in os.listdir(path): + file_name_trunk = file_name.split(".")[0].lower() + regular_omop_table_name = file_name_trunk.replace(prefix, "") + + if regular_omop_table_name in tables: + used_tables.append(regular_omop_table_name) + + if regular_omop_table_name == "measurement": + dtype = {"measurement_source_value": str} + else: + dtype = None + + # read raw csv as temporary table + temp_relation = backend_handle.read_csv(path / file_name, dtype=dtype) # noqa: F841 + backend_handle.execute("CREATE OR REPLACE TABLE temp_table AS SELECT * FROM temp_relation") + + # make query to create table with lowercase column names + column_names = backend_handle.execute("DESCRIBE temp_table").df()["column_name"].values + select_columns = ", ".join([f'"{col}" AS "{col.lower()}"' for col in column_names]) + create_table_with_lowercase_columns_query = ( + f"CREATE TABLE {regular_omop_table_name} AS SELECT {select_columns} FROM temp_table" + ) + + # write proper table + existing_tables = backend_handle.execute("SHOW TABLES").df()["name"].values + if regular_omop_table_name in existing_tables: + logging.info(f"Table {regular_omop_table_name} already exists. Dropping and recreating...") + backend_handle.execute(f"DROP TABLE {regular_omop_table_name}") + + backend_handle.execute(create_table_with_lowercase_columns_query) + + backend_handle.execute("DROP TABLE temp_table") + + elif file_name_trunk != DOWNLOAD_VERIFICATION_TAG: + unused_files.append(file_name) + + for table in tables: + if table not in used_tables: + missing_tables.append(table) + + logging.info(f"missing tables: {missing_tables}") + logging.info(f"unused files: {unused_files}") def _check_valid_backend_handle(backend_handle) -> None: @@ -182,28 +237,28 @@ def _create_enriched_var_with_unit_info(backend_handle, ds, var, unit_report) -> return feature_concept_id_unit_info_table -def register_omop_to_db_connection( - path: Path, - backend_handle: duckdb.duckdb.DuckDBPyConnection, - source: Literal["csv"] = "csv", -) -> None: - """Register the OMOP CDM tables to the database.""" - missing_tables = [] - for table in get_omop_table_names(): - # if path exists lowercse, uppercase, capitalized: - table_path = f"{path}/{table}.csv" - if os.path.exists(table_path): - if table == "measurement": - backend_handle.register( - table, backend_handle.read_csv(f"{path}/{table}.csv", dtype={"measurement_source_value": str}) - ) - else: - backend_handle.register(table, backend_handle.read_csv(f"{path}/{table}.csv")) - else: - missing_tables.append([table]) - print("missing tables: ", missing_tables) +def setup_connection(path: Path | str, backend_handle: DuckDBPyConnection, prefix: str = "") -> None: + """Setup a connection to the OMOP CDM database. 
-    return None
+    This function sets up a connection to the OMOP CDM database.
+    Table and column names are normalized to lowercase, so the CSV files can use any capitalization style.
+
+
+    Parameters
+    ----------
+    path
+        The path to the folder containing the CSV files.
+    backend_handle
+        The backend handle to the database.
+    prefix
+        The prefix to be removed from the CSV filenames.
+
+    Returns
+    -------
+    None. The tables are created on the passed DuckDB connection, with lowercase table and column names.
+
+    """
+    _set_up_duckdb(Path(path), backend_handle, prefix)


 def setup_obs(
@@ -326,7 +381,7 @@ def setup_variables(
     if time_defining_table is None:
        raise ValueError("The observation table must be set up first, use the `setup_obs` function.")

-    if data_tables[0] in ["measurement", "observation"]:
+    if data_tables[0] in ["measurement", "observation", "specimen"]:
         # also keep unit_concept_id and unit_source_value;
         if isinstance(data_field_to_keep, list):
             data_field_to_keep = list(data_field_to_keep) + ["unit_concept_id", "unit_source_value"]
@@ -359,7 +414,10 @@ def setup_variables(
     unit_report = _create_feature_unit_concept_id_report(backend_handle, ds)

     var = ds["data_table_concept_id"].to_dataframe()
-    concepts = backend_handle.sql("SELECT * FROM concept").df()
+
+    if enrich_var_with_feature_info or enrich_var_with_unit_info:
+        concepts = backend_handle.sql("SELECT * FROM concept").df()
+        concepts.columns = concepts.columns.str.lower()

     if enrich_var_with_feature_info:
         var = pd.merge(var, concepts, how="left", left_index=True, right_on="concept_id")
@@ -393,21 +451,6 @@ def setup_variables(
     return edata


-def load(
-    backend_handle: Literal[str, duckdb, Path],
-    # folder_path: str,
-    # delimiter: str = ",",
-    # make_filename_lowercase: bool = True,
-) -> None:
-    """Initialize a connection to the OMOP CDM Database."""
-    if isinstance(backend_handle, str) or isinstance(backend_handle, Path):
-        _check_sanity_of_folder(backend_handle)
-    elif isinstance(backend_handle, duckdb.DuckDB):
-        _check_sanity_of_database(backend_handle)
-    else:
-        raise NotImplementedError(f"Backend {backend_handle} not supported.
Choose a valid backend.") - - def get_table(duckdb_instance, table_name: str) -> pd.DataFrame: """Extract a table of an OMOP CDM Database.""" return _lowercase_column_names(duckdb_instance.sql(f"SELECT * FROM {table_name}").df()) diff --git a/tests/conftest.py b/tests/conftest.py index 8f5fbc0..baf5e94 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,12 +1,20 @@ import duckdb import pytest -from ehrdata.io.omop import register_omop_to_db_connection +from ehrdata.io.omop import setup_connection @pytest.fixture # (scope="session") def omop_connection_vanilla(): con = duckdb.connect() - register_omop_to_db_connection(path="tests/data/toy_omop/vanilla", backend_handle=con, source="csv") + setup_connection(path="tests/data/toy_omop/vanilla", backend_handle=con) + yield con + con.close() + + +@pytest.fixture # (scope="session") +def omop_connection_capital_letters(): + con = duckdb.connect() + setup_connection(path="tests/data/toy_omop/capital_letters", backend_handle=con) yield con con.close() diff --git a/tests/data/toy_omop/capital_letters/MEASUREMENT.csv b/tests/data/toy_omop/capital_letters/MEASUREMENT.csv new file mode 100644 index 0000000..5a548ac --- /dev/null +++ b/tests/data/toy_omop/capital_letters/MEASUREMENT.csv @@ -0,0 +1,2 @@ +MEASUREMENT_ID,PERSON_ID,MEASUREMENT_CONCEPT_ID,MEASUREMENT_DATE,MEASUREMENT_DATETIME,MEASUREMENT_TIME,MEASUREMENT_TYPE_CONCEPT_ID,OPERATOR_CONCEPT_ID,VALUE_AS_NUMBER,VALUE_AS_CONCEPT_ID,UNIT_CONCEPT_ID,RANGE_LOW,RANGE_HIGH,PROVIDER_ID,VISIT_OCCURRENCE_ID,VISIT_DETAIL_ID,MEASUREMENT_SOURCE_VALUE,MEASUREMENT_SOURCE_CONCEPT_ID,UNIT_SOURCE_VALUE,VALUE_SOURCE_VALUE +1,1,3031147,2100-01-01,2100-01-01 12:00:00,12:00:00,32856,,18,,9557,21,30,,1,,50804,2000001003,mEq/L,18 diff --git a/tests/data/toy_omop/capital_letters/OBSERVATION_PERIOD.csv b/tests/data/toy_omop/capital_letters/OBSERVATION_PERIOD.csv new file mode 100644 index 0000000..04d6e15 --- /dev/null +++ b/tests/data/toy_omop/capital_letters/OBSERVATION_PERIOD.csv @@ -0,0 +1,2 @@ +OBSERVATION_PERIOD_ID,PERSON_ID,OBSERVATION_PERIOD_START_DATE,OBSERVATION_PERIOD_END_DATE,PERIOD_TYPE_CONCEPT_ID +1,1,2100-01-01,2100-01-31,32828 diff --git a/tests/data/toy_omop/capital_letters/PERSON.csv b/tests/data/toy_omop/capital_letters/PERSON.csv new file mode 100644 index 0000000..413bedf --- /dev/null +++ b/tests/data/toy_omop/capital_letters/PERSON.csv @@ -0,0 +1,2 @@ +PERSON_ID,GENDER_CONCEPT_ID,YEAR_OF_BIRTH,MONTH_OF_BIRTH,DAY_OF_BIRTH,BIRTH_DATETIME,RACE_CONCEPT_ID,ETHNICITY_CONCEPT_ID,LOCATION_ID,PROVIDER_ID,CARE_SITE_ID,PERSON_SOURCE_VALUE,GENDER_SOURCE_VALUE,GENDER_SOURCE_CONCEPT_ID,RACE_SOURCE_VALUE,RACE_SOURCE_CONCEPT_ID,ETHNICITY_SOURCE_VALUE,ETHNICITY_SOURCE_CONCEPT_ID +1,8507,2095,,,,0,38003563,,,,1234,M,0,,,, diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py index 68ed0fc..f6d4024 100644 --- a/tests/test_io/test_omop.py +++ b/tests/test_io/test_omop.py @@ -274,3 +274,30 @@ def test_setup_variables_illegal_argument_types( enrich_var_with_feature_info=enrich_var_with_feature_info, enrich_var_with_unit_info=enrich_var_with_unit_info, ) + + +def test_capital_letters(omop_connection_capital_letters): + # test capital letters both in table names and column names + con = omop_connection_capital_letters + edata = ed.io.omop.setup_obs(backend_handle=con, observation_table="person_observation_period") + edata = ed.io.omop.setup_variables( + edata, + backend_handle=con, + data_tables=["measurement"], + data_field_to_keep=["value_as_number"], + interval_length_number=1, + 
interval_length_unit="day", + num_intervals=1, + enrich_var_with_feature_info=False, + enrich_var_with_unit_info=False, + ) + + assert edata.r[0, 0, 0] == 18 + + tables = con.execute("SHOW TABLES").df()["name"].values + assert "measurement" in tables + assert "MEASUREMENT" not in tables + + measurement_columns = con.execute("SELECT * FROM measurement").df().columns + assert "measurement_id" in measurement_columns + assert "MEASUREMENT_ID" not in measurement_columns From 907c21fbc5119fa3f526dbf8ad6a05657a682ad1 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Sun, 17 Nov 2024 19:00:15 +0100 Subject: [PATCH 22/43] remove load --- docs/api.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/api.md b/docs/api.md index 2dc62aa..b209ba5 100644 --- a/docs/api.md +++ b/docs/api.md @@ -23,7 +23,6 @@ io.omop.setup_obs io.omop.setup_variables io.omop.get_time_interval_table - io.omop.load ``` ## Datasets From c91e4014e40e7a817437dc88c4f7a360f98280f3 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Sun, 17 Nov 2024 19:02:59 +0100 Subject: [PATCH 23/43] remove some things I forgot --- src/ehrdata/dt/datasets.py | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/src/ehrdata/dt/datasets.py b/src/ehrdata/dt/datasets.py index 152a3eb..94a6e2e 100644 --- a/src/ehrdata/dt/datasets.py +++ b/src/ehrdata/dt/datasets.py @@ -17,13 +17,6 @@ from ehrdata import EHRData -COLUMN_CASE = { - "uppercase": "uppercase", - "lowercase": "lowercase", - "titlecase": "titlecase", -} - - def _setup_eunomia_datasets( data_url: str, backend_handle: DuckDBPyConnection, @@ -41,12 +34,11 @@ def _setup_eunomia_datasets( for file_path in (data_path / nested_omop_tables_folder).glob("*.csv"): shutil.move(file_path, data_path) - edata = setup_connection( + setup_connection( data_path, backend_handle, prefix=dataset_prefix, ) - return edata def mimic_iv_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: @@ -81,14 +73,13 @@ def mimic_iv_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = N if data_path is None: data_path = Path("ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9") - edata = _setup_eunomia_datasets( + _setup_eunomia_datasets( data_url=data_url, backend_handle=backend_handle, data_path=data_path, nested_omop_tables_folder="mimic-iv-demo-data-in-the-omop-common-data-model-0.9/1_omop_data_csv", dataset_prefix="2b_", ) - return edata def gibleed_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: @@ -122,15 +113,13 @@ def gibleed_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = No if data_path is None: data_path = Path("ehrapy_data/GiBleed_5.3") - edata = _setup_eunomia_datasets( + _setup_eunomia_datasets( data_url=data_url, backend_handle=backend_handle, data_path=data_path, nested_omop_tables_folder="GiBleed_5.3", ) - return edata - def synthea27nj_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: """Loads the Synthea27Nj dataset in the OMOP Common Data model. 
@@ -163,14 +152,12 @@ def synthea27nj_omop(backend_handle: DuckDBPyConnection, data_path: Path | None if data_path is None: data_path = Path("ehrapy_data/Synthea27Nj_5.4") - edata = _setup_eunomia_datasets( + _setup_eunomia_datasets( data_url=data_url, backend_handle=backend_handle, data_path=data_path, ) - return edata - def mimic_ii(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: """Loads the MIMIC2 dataset.""" From a7c7af287e4a3de8ad7db675ff53b1573726ce05 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Sun, 17 Nov 2024 19:21:06 +0100 Subject: [PATCH 24/43] move validity checks to separate file --- src/ehrdata/io/omop/_check_arguments.py | 94 ++++++++++++++++++++++ src/ehrdata/io/omop/omop.py | 102 ++++-------------------- 2 files changed, 111 insertions(+), 85 deletions(-) create mode 100644 src/ehrdata/io/omop/_check_arguments.py diff --git a/src/ehrdata/io/omop/_check_arguments.py b/src/ehrdata/io/omop/_check_arguments.py new file mode 100644 index 0000000..ca4d753 --- /dev/null +++ b/src/ehrdata/io/omop/_check_arguments.py @@ -0,0 +1,94 @@ +from __future__ import annotations + +from collections.abc import Sequence + +import duckdb + +from ehrdata.io.omop._queries import ( + AGGREGATION_STRATEGY_KEY, +) + +DOWNLOAD_VERIFICATION_TAG = "download_verification_tag" +VALID_OBSERVATION_TABLES_SINGLE = ["person"] +VALID_OBSERVATION_TABLES_JOIN = ["person_cohort", "person_observation_period", "person_visit_occurrence"] +VALID_VARIABLE_TABLES = ["measurement", "observation", "specimen"] + + +def _check_valid_backend_handle(backend_handle) -> None: + if not isinstance(backend_handle, duckdb.duckdb.DuckDBPyConnection): + raise TypeError("Expected backend_handle to be of type DuckDBPyConnection.") + + +def _check_valid_observation_table(observation_table) -> None: + if not isinstance(observation_table, str): + raise TypeError("Expected observation_table to be a string.") + if observation_table not in VALID_OBSERVATION_TABLES_SINGLE + VALID_OBSERVATION_TABLES_JOIN: + raise ValueError( + f"observation_table must be one of {VALID_OBSERVATION_TABLES_SINGLE+VALID_OBSERVATION_TABLES_JOIN}." 
+ ) + + +def _check_valid_death_table(death_table) -> None: + if not isinstance(death_table, bool): + raise TypeError("Expected death_table to be a boolean.") + + +def _check_valid_edata(edata) -> None: + from ehrdata import EHRData + + if not isinstance(edata, EHRData): + raise TypeError("Expected edata to be of type EHRData.") + + +def _check_valid_data_tables(data_tables) -> Sequence: + if isinstance(data_tables, str): + data_tables = [data_tables] + if not isinstance(data_tables, Sequence): + raise TypeError("Expected data_tables to be a string or Sequence.") + if not all(table in VALID_VARIABLE_TABLES for table in data_tables): + raise ValueError(f"data_tables must be a subset of {VALID_VARIABLE_TABLES}.") + return data_tables + + +def _check_valid_data_field_to_keep(data_field_to_keep) -> Sequence: + if isinstance(data_field_to_keep, str): + data_field_to_keep = [data_field_to_keep] + if not isinstance(data_field_to_keep, Sequence): + raise TypeError("Expected data_field_to_keep to be a string or Sequence.") + return data_field_to_keep + + +def _check_valid_interval_length_number(interval_length_number) -> None: + if not isinstance(interval_length_number, int): + raise TypeError("Expected interval_length_number to be an integer.") + + +def _check_valid_interval_length_unit(interval_length_unit) -> None: + # TODO: maybe check if it is a valid unit from pandas.to_timedelta + if not isinstance(interval_length_unit, str): + raise TypeError("Expected interval_length_unit to be a string.") + + +def _check_valid_num_intervals(num_intervals) -> None: + if not isinstance(num_intervals, int): + raise TypeError("Expected num_intervals to be an integer.") + + +def _check_valid_concept_ids(concept_ids) -> None: + if concept_ids != "all" and not isinstance(concept_ids, Sequence): + raise TypeError("concept_ids must be a sequence of integers or 'all'.") + + +def _check_valid_aggregation_strategy(aggregation_strategy) -> None: + if aggregation_strategy not in AGGREGATION_STRATEGY_KEY.keys(): + raise TypeError(f"aggregation_strategy must be one of {AGGREGATION_STRATEGY_KEY.keys()}.") + + +def _check_valid_enrich_var_with_feature_info(enrich_var_with_feature_info) -> None: + if not isinstance(enrich_var_with_feature_info, bool): + raise TypeError("Expected enrich_var_with_feature_info to be a boolean.") + + +def _check_valid_enrich_var_with_unit_info(enrich_var_with_unit_info) -> None: + if not isinstance(enrich_var_with_unit_info, bool): + raise TypeError("Expected enrich_var_with_unit_info to be a boolean.") diff --git a/src/ehrdata/io/omop/omop.py b/src/ehrdata/io/omop/omop.py index 3d6efb6..2c3d9df 100644 --- a/src/ehrdata/io/omop/omop.py +++ b/src/ehrdata/io/omop/omop.py @@ -12,18 +12,30 @@ import pandas as pd from duckdb.duckdb import DuckDBPyConnection +from ehrdata.io.omop._check_arguments import ( + VALID_OBSERVATION_TABLES_JOIN, + VALID_OBSERVATION_TABLES_SINGLE, + _check_valid_aggregation_strategy, + _check_valid_backend_handle, + _check_valid_concept_ids, + _check_valid_data_field_to_keep, + _check_valid_data_tables, + _check_valid_death_table, + _check_valid_edata, + _check_valid_enrich_var_with_feature_info, + _check_valid_enrich_var_with_unit_info, + _check_valid_interval_length_number, + _check_valid_interval_length_unit, + _check_valid_num_intervals, + _check_valid_observation_table, +) from ehrdata.io.omop._queries import ( - AGGREGATION_STRATEGY_KEY, time_interval_table_query_long_format, ) from ehrdata.utils._omop_utils import get_table_catalog_dict DOWNLOAD_VERIFICATION_TAG = 
"download_verification_tag" -VALID_OBSERVATION_TABLES_SINGLE = ["person"] -VALID_OBSERVATION_TABLES_JOIN = ["person_cohort", "person_observation_period", "person_visit_occurrence"] -VALID_VARIABLE_TABLES = ["measurement", "observation", "specimen"] - def _get_table_list() -> list: flat_table_list = [] @@ -84,86 +96,6 @@ def _set_up_duckdb(path: Path, backend_handle: DuckDBPyConnection, prefix: str = logging.info(f"unused files: {unused_files}") -def _check_valid_backend_handle(backend_handle) -> None: - if not isinstance(backend_handle, duckdb.duckdb.DuckDBPyConnection): - raise TypeError("Expected backend_handle to be of type DuckDBPyConnection.") - - -def _check_valid_observation_table(observation_table) -> None: - if not isinstance(observation_table, str): - raise TypeError("Expected observation_table to be a string.") - if observation_table not in VALID_OBSERVATION_TABLES_SINGLE + VALID_OBSERVATION_TABLES_JOIN: - raise ValueError( - f"observation_table must be one of {VALID_OBSERVATION_TABLES_SINGLE+VALID_OBSERVATION_TABLES_JOIN}." - ) - - -def _check_valid_death_table(death_table) -> None: - if not isinstance(death_table, bool): - raise TypeError("Expected death_table to be a boolean.") - - -def _check_valid_edata(edata) -> None: - from ehrdata import EHRData - - if not isinstance(edata, EHRData): - raise TypeError("Expected edata to be of type EHRData.") - - -def _check_valid_data_tables(data_tables) -> Sequence: - if isinstance(data_tables, str): - data_tables = [data_tables] - if not isinstance(data_tables, Sequence): - raise TypeError("Expected data_tables to be a string or Sequence.") - if not all(table in VALID_VARIABLE_TABLES for table in data_tables): - raise ValueError(f"data_tables must be a subset of {VALID_VARIABLE_TABLES}.") - return data_tables - - -def _check_valid_data_field_to_keep(data_field_to_keep) -> Sequence: - if isinstance(data_field_to_keep, str): - data_field_to_keep = [data_field_to_keep] - if not isinstance(data_field_to_keep, Sequence): - raise TypeError("Expected data_field_to_keep to be a string or Sequence.") - return data_field_to_keep - - -def _check_valid_interval_length_number(interval_length_number) -> None: - if not isinstance(interval_length_number, int): - raise TypeError("Expected interval_length_number to be an integer.") - - -def _check_valid_interval_length_unit(interval_length_unit) -> None: - # TODO: maybe check if it is a valid unit from pandas.to_timedelta - if not isinstance(interval_length_unit, str): - raise TypeError("Expected interval_length_unit to be a string.") - - -def _check_valid_num_intervals(num_intervals) -> None: - if not isinstance(num_intervals, int): - raise TypeError("Expected num_intervals to be an integer.") - - -def _check_valid_concept_ids(concept_ids) -> None: - if concept_ids != "all" and not isinstance(concept_ids, Sequence): - raise TypeError("concept_ids must be a sequence of integers or 'all'.") - - -def _check_valid_aggregation_strategy(aggregation_strategy) -> None: - if aggregation_strategy not in AGGREGATION_STRATEGY_KEY.keys(): - raise TypeError(f"aggregation_strategy must be one of {AGGREGATION_STRATEGY_KEY.keys()}.") - - -def _check_valid_enrich_var_with_feature_info(enrich_var_with_feature_info) -> None: - if not isinstance(enrich_var_with_feature_info, bool): - raise TypeError("Expected enrich_var_with_feature_info to be a boolean.") - - -def _check_valid_enrich_var_with_unit_info(enrich_var_with_unit_info) -> None: - if not isinstance(enrich_var_with_unit_info, bool): - raise TypeError("Expected 
enrich_var_with_unit_info to be a boolean.") - - def _collect_units_per_feature(ds, unit_key="unit_concept_id") -> dict: feature_units = {} for i in range(ds[unit_key].shape[1]): From 09b0c34a9cb259ff8c2fc6e7935dd7800998cd22 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Sun, 17 Nov 2024 21:54:48 +0100 Subject: [PATCH 25/43] enable and vanilla test for specimen table --- tests/data/toy_omop/vanilla/specimen.csv | 10 ++++++++++ tests/test_io/test_omop.py | 13 +++++++------ 2 files changed, 17 insertions(+), 6 deletions(-) create mode 100644 tests/data/toy_omop/vanilla/specimen.csv diff --git a/tests/data/toy_omop/vanilla/specimen.csv b/tests/data/toy_omop/vanilla/specimen.csv new file mode 100644 index 0000000..ada93dd --- /dev/null +++ b/tests/data/toy_omop/vanilla/specimen.csv @@ -0,0 +1,10 @@ +specimen_id,person_id,specimen_concept_id,specimen_type_concept_id,specimen_date,specimen_datetime,quantity,unit_concept_id,anatomic_site_concept_id,disease_status_concept_id,specimen_source_id,specimen_source_value,unit_source_value,anatomic_site_source_value,disease_status_source_value +1,1,4001225,32856,2100-01-01,2100-01-01 12:00:00,0.5,,0,0,,70012,,, +2,1,4001225,32856,2100-01-01,2100-01-01 13:00:00,0.5,,0,0,,70012,,, +3,1,4121345,32856,2100-01-01,2100-01-01 14:00:00,1.5,,0,0,,70021,,, +4,2,4001225,32856,2100-01-01,2100-01-01 12:00:00,0.5,,0,0,,70012,,, +5,2,4001225,32856,2100-01-01,2100-01-01 13:00:00,0.5,,0,0,,70012,,, +6,2,4121345,32856,2100-01-01,2100-01-01 14:00:00,1.5,,0,0,,70021,,, +7,3,4001225,32856,2100-01-01,2100-01-01 12:00:00,0.5,,0,0,,70012,,, +8,3,4001225,32856,2100-01-01,2100-01-01 13:00:00,0.5,,0,0,,70012,,, +9,3,4121345,32856,2100-01-01,2100-01-01 14:00:00,1.5,,0,0,,70021,,, diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py index f6d4024..81ae099 100644 --- a/tests/test_io/test_omop.py +++ b/tests/test_io/test_omop.py @@ -13,6 +13,7 @@ VANILLA_NUM_CONCEPTS = { "measurement": 2, "observation": 2, + "specimen": 2, } # constants for setup_variables @@ -88,12 +89,12 @@ def test_setup_obs_invalid_observation_table_value(omop_connection_vanilla): ["person_cohort", "person_observation_period", "person_visit_occurrence"], ) @pytest.mark.parametrize( - "data_tables", - [["measurement"], ["observation"]], -) -@pytest.mark.parametrize( - "data_field_to_keep", - [["value_as_number"], ["value_as_concept_id"]], + "data_tables,data_field_to_keep", + [ + (["measurement"], ["value_as_number", "value_as_concept_id"]), + (["observation"], ["value_as_number", "value_as_concept_id"]), + (["specimen"], ["quantity"]), + ], ) @pytest.mark.parametrize( "enrich_var_with_feature_info", From 6854159718504a11637e87a8e2031103067e66de Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Mon, 18 Nov 2024 12:15:20 +0100 Subject: [PATCH 26/43] escape the % in duckdb's read_csv --- src/ehrdata/io/omop/omop.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ehrdata/io/omop/omop.py b/src/ehrdata/io/omop/omop.py index 2c3d9df..efd5bdd 100644 --- a/src/ehrdata/io/omop/omop.py +++ b/src/ehrdata/io/omop/omop.py @@ -65,7 +65,7 @@ def _set_up_duckdb(path: Path, backend_handle: DuckDBPyConnection, prefix: str = dtype = None # read raw csv as temporary table - temp_relation = backend_handle.read_csv(path / file_name, dtype=dtype) # noqa: F841 + temp_relation = backend_handle.read_csv(path / file_name, dtype=dtype, escapechar="%") # noqa: F841 backend_handle.execute("CREATE OR REPLACE TABLE temp_table AS SELECT * FROM temp_relation") # make query to create table with lowercase 
column names From fcd641b3aeec3b52e3132a2c938435dd8f8b35d2 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Mon, 18 Nov 2024 15:38:42 +0100 Subject: [PATCH 27/43] add robustness: empty observation,measurement,speciment acceptable --- src/ehrdata/io/omop/omop.py | 152 ++++++++++++++++++++++++++++++++++++ tests/conftest.py | 8 ++ tests/test_io/test_omop.py | 18 +++++ 3 files changed, 178 insertions(+) diff --git a/src/ehrdata/io/omop/omop.py b/src/ehrdata/io/omop/omop.py index efd5bdd..344fdac 100644 --- a/src/ehrdata/io/omop/omop.py +++ b/src/ehrdata/io/omop/omop.py @@ -325,6 +325,158 @@ def setup_variables( else: raise ValueError + # dbms complains about our queries, which sometimes need a column to be of type e.g. datetime, when it can't infer types from data + count = backend_handle.execute(f"SELECT COUNT(*) as count FROM {data_tables[0]}").df()["count"].item() + if count == 0: + logging.info(f"No data found in {data_tables[0]}. Returning edata without additional variables.") + return edata + + ds = ( + time_interval_table_query_long_format( + backend_handle=backend_handle, + time_defining_table=time_defining_table, + data_table=data_tables[0], + data_field_to_keep=data_field_to_keep, + interval_length_number=interval_length_number, + interval_length_unit=interval_length_unit, + num_intervals=num_intervals, + aggregation_strategy=aggregation_strategy, + ) + .set_index(["person_id", "data_table_concept_id", "interval_step"]) + .to_xarray() + ) + + _check_one_unit_per_feature(ds) + # TODO ignore? go with more vanilla omop style. _check_one_unit_per_feature(ds, unit_key="unit_source_value") + + unit_report = _create_feature_unit_concept_id_report(backend_handle, ds) + + var = ds["data_table_concept_id"].to_dataframe() + + if enrich_var_with_feature_info or enrich_var_with_unit_info: + concepts = backend_handle.sql("SELECT * FROM concept").df() + concepts.columns = concepts.columns.str.lower() + + if enrich_var_with_feature_info: + var = pd.merge(var, concepts, how="left", left_index=True, right_on="concept_id") + + if enrich_var_with_unit_info: + if unit_report["multiple_units"].sum() > 0: + raise ValueError("Multiple units per feature found. Enrichment with feature information not possible.") + else: + var = pd.merge( + var, + unit_report, + how="left", + left_index=True, + right_on="unit_concept_id", + suffixes=("", "_unit"), + ) + var = pd.merge( + var, + concepts, + how="left", + left_on="unit_concept_id", + right_on="concept_id", + suffixes=("", "_unit"), + ) + + t = ds["interval_step"].to_dataframe() + + edata = EHRData(r=ds[data_field_to_keep[0]].values, obs=edata.obs, var=var, uns=edata.uns, t=t) + edata.uns[f"unit_report_{data_tables[0]}"] = unit_report + + return edata + + +def setup_interval_variables( + edata, + *, + backend_handle: duckdb.duckdb.DuckDBPyConnection, + data_tables: Sequence[Literal["drug_exposure"]] | Literal["drug_exposure"], + data_field_to_keep: str | Sequence[str], + interval_length_number: int, + interval_length_unit: str, + num_intervals: int, + concept_ids: Literal["all"] | Sequence = "all", + aggregation_strategy: str = "last", + enrich_var_with_feature_info: bool = False, + enrich_var_with_unit_info: bool = False, + keep_start_date_only: bool = False, +): + """Setup the interval variables + + This function sets up the variables that are stored as interval in OMOP for the EHRData object. + It will fail if there is more than one unit_concept_id per feature. + Writes a unit report of the features to edata.uns["unit_report_"]. 
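
[editor's note] A sketch of how the unit report written by setup_variables above is meant to be consumed; the argument values are illustrative:

    edata = ed.io.omop.setup_variables(
        edata,
        backend_handle=con,
        data_tables=["measurement"],
        data_field_to_keep=["value_as_number"],
        interval_length_number=1,
        interval_length_unit="day",
        num_intervals=30,
        enrich_var_with_unit_info=True,
    )
    # one row per feature; the 'multiple_units' column flags features with >1 unit_concept_id
    edata.uns["unit_report_measurement"]
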
+
+    Parameters
+    ----------
+    backend_handle
+        The backend handle to the database.
+    edata
+        The EHRData object to which the variables should be added.
+    data_tables
+        The table to be used. Only a single table can be used.
+    data_field_to_keep
+        The CDM Field in the data table to be kept. Can be e.g. "value_as_number" or "value_as_concept_id".
+    interval_length_number
+        Numeric value of the length of one interval.
+    interval_length_unit
+        Unit belonging to the interval length.
+    num_intervals
+        Number of intervals.
+    concept_ids
+        Concept IDs to use from this data table. If not specified, 'all' are used.
+    aggregation_strategy
+        Strategy to use when aggregating multiple data points within one interval.
+    enrich_var_with_feature_info
+        Whether to enrich the var table with feature information. If a concept_id is not found in the concept table, the feature information will be NaN.
+    enrich_var_with_unit_info
+        Whether to enrich the var table with unit information. Raises an Error if a) multiple units per feature are found for at least one feature. If a concept_id is not found in the concept table, the feature information will be NaN.
+
+    Returns
+    -------
+    An EHRData object with populated .r and .var fields.
+    """
+    from ehrdata import EHRData
+
+    _check_valid_edata(edata)
+    _check_valid_backend_handle(backend_handle)
+    data_tables = _check_valid_data_tables(data_tables)
+    data_field_to_keep = _check_valid_data_field_to_keep(data_field_to_keep)
+    _check_valid_interval_length_number(interval_length_number)
+    _check_valid_interval_length_unit(interval_length_unit)
+    _check_valid_num_intervals(num_intervals)
+    _check_valid_concept_ids(concept_ids)
+    _check_valid_aggregation_strategy(aggregation_strategy)
+    _check_valid_enrich_var_with_feature_info(enrich_var_with_feature_info)
+    _check_valid_enrich_var_with_unit_info(enrich_var_with_unit_info)
+
+    time_defining_table = edata.uns.get("omop_io_observation_table", None)
+    if time_defining_table is None:
+        raise ValueError("The observation table must be set up first, use the `setup_obs` function.")
+
+    if data_tables[0] in ["drug_exposure"]:
+        # also keep unit_concept_id and unit_source_value;
+        if isinstance(data_field_to_keep, list):
+            data_field_to_keep = list(data_field_to_keep) + ["unit_concept_id", "unit_source_value"]
+        # TODO: use in future version when more than one data table can be used
+        # elif isinstance(data_field_to_keep, dict):
+        #     data_field_to_keep = {
+        #         k: v + ["unit_concept_id", "unit_source_value"] for k, v in data_field_to_keep.items()
+        #     }
+    else:
+        raise ValueError
+
+    # dbms complains about our queries, which sometimes need a column to be of type e.g.
datetime, when it can't infer types from data
+    count = backend_handle.execute(f"SELECT COUNT(*) as count FROM {data_tables[0]}").df()["count"].item()
+    if count == 0:
+        logging.info(f"No data in {data_tables}.")
+        return edata
+
     ds = (
         time_interval_table_query_long_format(
             backend_handle=backend_handle,
diff --git a/tests/conftest.py b/tests/conftest.py
index baf5e94..a42fcb1 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -18,3 +18,11 @@ def omop_connection_capital_letters():
     setup_connection(path="tests/data/toy_omop/capital_letters", backend_handle=con)
     yield con
     con.close()
+
+
+@pytest.fixture  # (scope="session")
+def omop_connection_empty_observation():
+    con = duckdb.connect()
+    setup_connection(path="tests/data/toy_omop/empty_observation", backend_handle=con)
+    yield con
+    con.close()
diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py
index 81ae099..01c995a 100644
--- a/tests/test_io/test_omop.py
+++ b/tests/test_io/test_omop.py
@@ -302,3 +302,21 @@ def test_capital_letters(omop_connection_capital_letters):
     measurement_columns = con.execute("SELECT * FROM measurement").df().columns
     assert "measurement_id" in measurement_columns
     assert "MEASUREMENT_ID" not in measurement_columns
+
+
+def test_empty_observation(omop_connection_empty_observation, caplog):
+    con = omop_connection_empty_observation
+    edata = ed.io.omop.setup_obs(backend_handle=con, observation_table="person")
+    edata = ed.io.omop.setup_variables(
+        edata,
+        backend_handle=con,
+        data_tables=["observation"],
+        data_field_to_keep=["value_as_number"],
+        interval_length_number=1,
+        interval_length_unit="day",
+        num_intervals=1,
+        enrich_var_with_feature_info=False,
+        enrich_var_with_unit_info=False,
+    )
+    assert edata.shape == (1, 0)
+    assert "No data found in observation. Returning edata without additional variables."
in caplog.text From a390b4cb06cba43cf0bced6f897b3ce636bd5e7a Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Mon, 18 Nov 2024 16:16:47 +0100 Subject: [PATCH 28/43] add empty_observation test data --- tests/data/toy_omop/empty_observation/observation.csv | 1 + tests/data/toy_omop/empty_observation/person.csv | 2 ++ 2 files changed, 3 insertions(+) create mode 100644 tests/data/toy_omop/empty_observation/observation.csv create mode 100644 tests/data/toy_omop/empty_observation/person.csv diff --git a/tests/data/toy_omop/empty_observation/observation.csv b/tests/data/toy_omop/empty_observation/observation.csv new file mode 100644 index 0000000..ad1a438 --- /dev/null +++ b/tests/data/toy_omop/empty_observation/observation.csv @@ -0,0 +1 @@ +observation_id,person_id,observation_concept_id,observation_date,observation_datetime,observation_type_concept_id,value_as_number,value_as_string,value_as_concept_id,qualifier_concept_id,unit_concept_id,provider_id,visit_occurrence_id,visit_detail_id,observation_source_value,observation_source_concept_id,unit_source_value,qualifier_source_value diff --git a/tests/data/toy_omop/empty_observation/person.csv b/tests/data/toy_omop/empty_observation/person.csv new file mode 100644 index 0000000..0f13db9 --- /dev/null +++ b/tests/data/toy_omop/empty_observation/person.csv @@ -0,0 +1,2 @@ +person_id,gender_concept_id,year_of_birth,month_of_birth,day_of_birth,birth_datetime,race_concept_id,ethnicity_concept_id,location_id,provider_id,care_site_id,person_source_value,gender_source_value,gender_source_concept_id,race_source_value,race_source_concept_id,ethnicity_source_value,ethnicity_source_concept_id +1,8507,2095,,,,0,38003563,,,,1234,M,0,,,, From adcf18676d5e5bda87583188ba36f1525f843c36 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Mon, 18 Nov 2024 17:50:52 +0100 Subject: [PATCH 29/43] add support for start or end of e.g. 
drug_exposure --- src/ehrdata/io/omop/__init__.py | 1 + src/ehrdata/io/omop/_check_arguments.py | 21 ++++++- src/ehrdata/io/omop/_queries.py | 17 ++++- src/ehrdata/io/omop/omop.py | 63 +++++-------------- tests/data/toy_omop/vanilla/drug_exposure.csv | 10 +++ tests/test_io/test_omop.py | 51 +++++++++++++++ 6 files changed, 112 insertions(+), 51 deletions(-) create mode 100644 tests/data/toy_omop/vanilla/drug_exposure.csv diff --git a/src/ehrdata/io/omop/__init__.py b/src/ehrdata/io/omop/__init__.py index 6fb9860..d6be480 100644 --- a/src/ehrdata/io/omop/__init__.py +++ b/src/ehrdata/io/omop/__init__.py @@ -13,6 +13,7 @@ # extract_procedure_occurrence, # extract_specimen, setup_connection, + setup_interval_variables, setup_obs, setup_variables, ) diff --git a/src/ehrdata/io/omop/_check_arguments.py b/src/ehrdata/io/omop/_check_arguments.py index ca4d753..8b145cf 100644 --- a/src/ehrdata/io/omop/_check_arguments.py +++ b/src/ehrdata/io/omop/_check_arguments.py @@ -12,6 +12,8 @@ VALID_OBSERVATION_TABLES_SINGLE = ["person"] VALID_OBSERVATION_TABLES_JOIN = ["person_cohort", "person_observation_period", "person_visit_occurrence"] VALID_VARIABLE_TABLES = ["measurement", "observation", "specimen"] +VALID_INTERVAL_VARIABLE_TABLES = ["drug_exposure"] +VALID_KEEP_DATES = ["start", "end", "interval"] def _check_valid_backend_handle(backend_handle) -> None: @@ -40,7 +42,7 @@ def _check_valid_edata(edata) -> None: raise TypeError("Expected edata to be of type EHRData.") -def _check_valid_data_tables(data_tables) -> Sequence: +def _check_valid_variable_data_tables(data_tables) -> Sequence: if isinstance(data_tables, str): data_tables = [data_tables] if not isinstance(data_tables, Sequence): @@ -50,6 +52,16 @@ def _check_valid_data_tables(data_tables) -> Sequence: return data_tables +def _check_valid_interval_variable_data_tables(data_tables) -> Sequence: + if isinstance(data_tables, str): + data_tables = [data_tables] + if not isinstance(data_tables, Sequence): + raise TypeError("Expected data_tables to be a string or Sequence.") + if not all(table in VALID_INTERVAL_VARIABLE_TABLES for table in data_tables): + raise ValueError(f"data_tables must be a subset of {VALID_INTERVAL_VARIABLE_TABLES}.") + return data_tables + + def _check_valid_data_field_to_keep(data_field_to_keep) -> Sequence: if isinstance(data_field_to_keep, str): data_field_to_keep = [data_field_to_keep] @@ -92,3 +104,10 @@ def _check_valid_enrich_var_with_feature_info(enrich_var_with_feature_info) -> N def _check_valid_enrich_var_with_unit_info(enrich_var_with_unit_info) -> None: if not isinstance(enrich_var_with_unit_info, bool): raise TypeError("Expected enrich_var_with_unit_info to be a boolean.") + + +def _check_valid_keep_date(keep_date: str) -> None: + if not isinstance(keep_date, str): + raise TypeError("Expected keep_date to be a string.") + if keep_date not in VALID_KEEP_DATES: + raise ValueError(f"keep_date must be one of {VALID_KEEP_DATES}.") diff --git a/src/ehrdata/io/omop/_queries.py b/src/ehrdata/io/omop/_queries.py index abdbf80..2975231 100644 --- a/src/ehrdata/io/omop/_queries.py +++ b/src/ehrdata/io/omop/_queries.py @@ -19,6 +19,13 @@ "cohort": "subject_id", } +DATA_TABLE_CONCEPT_ID_TRUNK = { + "measurement": "measurement", + "observation": "observation", + "specimen": "specimen", + "drug_exposure": "drug", +} + AGGREGATION_STRATEGY_KEY = { "last": "LAST", "first": "FIRST", @@ -83,11 +90,15 @@ def time_interval_table_query_long_format( num_intervals: int, aggregation_strategy: str, data_field_to_keep: Sequence[str] | 
str, + date_prefix: str = "", ) -> pd.DataFrame: """Returns a long format DataFrame from the data_table. The following columns should be considered the indices of this long format: person_id, data_table_concept_id, interval_step. The other columns, except for start_date and end_date, should be considered the values.""" if isinstance(data_field_to_keep, str): data_field_to_keep = [data_field_to_keep] + if date_prefix != "": + date_prefix = date_prefix + "_" + timedeltas_dataframe = _generate_timedeltas(interval_length_number, interval_length_unit, num_intervals) _write_timedeltas_to_db( @@ -110,10 +121,10 @@ def time_interval_table_query_long_format( ), \ person_data_table AS( \ WITH distinct_data_table_concept_ids AS ( \ - SELECT DISTINCT {data_table}_concept_id + SELECT DISTINCT {DATA_TABLE_CONCEPT_ID_TRUNK[data_table]}_concept_id FROM {data_table} \ ) - SELECT person.person_id, {data_table}_concept_id as data_table_concept_id \ + SELECT person.person_id, {DATA_TABLE_CONCEPT_ID_TRUNK[data_table]}_concept_id as data_table_concept_id \ FROM person \ CROSS JOIN distinct_data_table_concept_ids \ ), \ @@ -129,7 +140,7 @@ def time_interval_table_query_long_format( ) \ SELECT lfi.person_id, lfi.data_table_concept_id, interval_step, interval_start, interval_end, {_generate_value_query(data_table, data_field_to_keep, AGGREGATION_STRATEGY_KEY[aggregation_strategy])} \ FROM long_format_intervals as lfi \ - LEFT JOIN {data_table} ON lfi.person_id = {data_table}.person_id AND lfi.data_table_concept_id = {data_table}.{data_table}_concept_id AND {data_table}.{data_table}_date BETWEEN lfi.interval_start AND lfi.interval_end \ + LEFT JOIN {data_table} ON lfi.person_id = {data_table}.person_id AND lfi.data_table_concept_id = {data_table}.{DATA_TABLE_CONCEPT_ID_TRUNK[data_table]}_concept_id AND {data_table}.{data_table}_{date_prefix}date BETWEEN lfi.interval_start AND lfi.interval_end \ GROUP BY lfi.person_id, lfi.data_table_concept_id, interval_step, interval_start, interval_end """ ).df() diff --git a/src/ehrdata/io/omop/omop.py b/src/ehrdata/io/omop/omop.py index 344fdac..876b9d2 100644 --- a/src/ehrdata/io/omop/omop.py +++ b/src/ehrdata/io/omop/omop.py @@ -19,15 +19,17 @@ _check_valid_backend_handle, _check_valid_concept_ids, _check_valid_data_field_to_keep, - _check_valid_data_tables, _check_valid_death_table, _check_valid_edata, _check_valid_enrich_var_with_feature_info, _check_valid_enrich_var_with_unit_info, _check_valid_interval_length_number, _check_valid_interval_length_unit, + _check_valid_interval_variable_data_tables, + _check_valid_keep_date, _check_valid_num_intervals, _check_valid_observation_table, + _check_valid_variable_data_tables, ) from ehrdata.io.omop._queries import ( time_interval_table_query_long_format, @@ -299,7 +301,7 @@ def setup_variables( _check_valid_edata(edata) _check_valid_backend_handle(backend_handle) - data_tables = _check_valid_data_tables(data_tables) + data_tables = _check_valid_variable_data_tables(data_tables) data_field_to_keep = _check_valid_data_field_to_keep(data_field_to_keep) _check_valid_interval_length_number(interval_length_number) _check_valid_interval_length_unit(interval_length_unit) @@ -394,7 +396,7 @@ def setup_interval_variables( *, backend_handle: duckdb.duckdb.DuckDBPyConnection, data_tables: Sequence[Literal["drug_exposure"]] | Literal["drug_exposure"], - data_field_to_keep: str | Sequence[str], + data_field_to_keep: str | Sequence[str] | Literal["one-hot"], interval_length_number: int, interval_length_unit: str, num_intervals: int, @@ -402,7 
+404,7 @@ def setup_interval_variables( aggregation_strategy: str = "last", enrich_var_with_feature_info: bool = False, enrich_var_with_unit_info: bool = False, - keep_start_date_only: bool = False, + keep_date: Literal["start", "end", "interval"] = "start", ): """Setup the interval variables @@ -434,8 +436,8 @@ def setup_interval_variables( Strategy to use when aggregating multiple data points within one interval. enrich_var_with_feature_info Whether to enrich the var table with feature information. If a concept_id is not found in the concept table, the feature information will be NaN. - enrich_var_with_unit_info - Whether to enrich the var table with unit information. Raises an Error if a) multiple units per feature are found for at least one feature. If a concept_id is not found in the concept table, the feature information will be NaN. + keep_date + Whether to keep the start or end date, or the interval span. Returns ------- @@ -445,7 +447,7 @@ def setup_interval_variables( _check_valid_edata(edata) _check_valid_backend_handle(backend_handle) - data_tables = _check_valid_data_tables(data_tables) + data_tables = _check_valid_interval_variable_data_tables(data_tables) data_field_to_keep = _check_valid_data_field_to_keep(data_field_to_keep) _check_valid_interval_length_number(interval_length_number) _check_valid_interval_length_unit(interval_length_unit) @@ -454,29 +456,22 @@ def setup_interval_variables( _check_valid_aggregation_strategy(aggregation_strategy) _check_valid_enrich_var_with_feature_info(enrich_var_with_feature_info) _check_valid_enrich_var_with_unit_info(enrich_var_with_unit_info) + _check_valid_keep_date(keep_date) time_defining_table = edata.uns.get("omop_io_observation_table", None) if time_defining_table is None: raise ValueError("The observation table must be set up first, use the `setup_obs` function.") - if data_tables[0] in ["drug_exposure"]: - # also keep unit_concept_id and unit_source_value; - if isinstance(data_field_to_keep, list): - data_field_to_keep = list(data_field_to_keep) + ["unit_concept_id", "unit_source_value"] - # TODO: use in future version when more than one data table can be used - # elif isinstance(data_field_to_keep, dict): - # data_field_to_keep = { - # k: v + ["unit_concept_id", "unit_source_value"] for k, v in data_field_to_keep.items() - # } - else: - raise ValueError - # dbms complains about our queries, which sometimes need a column to be of type e.g. datetime, when it can't infer types from data - count = backend_handle.execute(f"SELECT COUNT(*) as count FROM {data_tables}").df()["count"].item() + count = backend_handle.execute(f"SELECT COUNT(*) as count FROM {data_tables[0]}").df()["count"].item() if count == 0: logging.info(f"No data in {data_tables}.") return edata + if keep_date == "start" or keep_date == "end": + date_prefix = keep_date + else: + raise NotImplementedError("support interval extraction coming soon") ds = ( time_interval_table_query_long_format( backend_handle=backend_handle, @@ -487,16 +482,12 @@ def setup_interval_variables( interval_length_unit=interval_length_unit, num_intervals=num_intervals, aggregation_strategy=aggregation_strategy, + date_prefix=date_prefix, ) .set_index(["person_id", "data_table_concept_id", "interval_step"]) .to_xarray() ) - _check_one_unit_per_feature(ds) - # TODO ignore? go with more vanilla omop style. 
_check_one_unit_per_feature(ds, unit_key="unit_source_value") - - unit_report = _create_feature_unit_concept_id_report(backend_handle, ds) - var = ds["data_table_concept_id"].to_dataframe() if enrich_var_with_feature_info or enrich_var_with_unit_info: @@ -506,31 +497,9 @@ def setup_interval_variables( if enrich_var_with_feature_info: var = pd.merge(var, concepts, how="left", left_index=True, right_on="concept_id") - if enrich_var_with_unit_info: - if unit_report["multiple_units"].sum() > 0: - raise ValueError("Multiple units per feature found. Enrichment with feature information not possible.") - else: - var = pd.merge( - var, - unit_report, - how="left", - left_index=True, - right_on="unit_concept_id", - suffixes=("", "_unit"), - ) - var = pd.merge( - var, - concepts, - how="left", - left_on="unit_concept_id", - right_on="concept_id", - suffixes=("", "_unit"), - ) - t = ds["interval_step"].to_dataframe() edata = EHRData(r=ds[data_field_to_keep[0]].values, obs=edata.obs, var=var, uns=edata.uns, t=t) - edata.uns[f"unit_report_{data_tables[0]}"] = unit_report return edata diff --git a/tests/data/toy_omop/vanilla/drug_exposure.csv b/tests/data/toy_omop/vanilla/drug_exposure.csv new file mode 100644 index 0000000..b8d81dc --- /dev/null +++ b/tests/data/toy_omop/vanilla/drug_exposure.csv @@ -0,0 +1,10 @@ +drug_exposure_id,person_id,drug_concept_id,drug_exposure_start_date,drug_exposure_start_datetime,drug_exposure_end_date,drug_exposure_end_datetime,verbatim_end_date,drug_type_concept_id,stop_reason,refills,quantity,days_supply,sig,route_concept_id,lot_number,provider_id,visit_occurrence_id,visit_detail_id,drug_source_value,drug_source_concept_id,route_source_value,dose_unit_source_value +1,1,19073183,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,2100-01-31,32869,,0,0,31,,0,0,10,1,,308182,19073183,, +2,1,19073183,2100-02-01,2100-02-01 12:00:00,2100-02-28,2100-02-28 00:00:00,2100-02-28,32869,,0,0,28,,0,0,10,1,,308182,19073183,, +3,1,19019979,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,2100-01-31,32869,,0,0,31,,0,0,10,1,,198405,19019979,, +4,2,19073183,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,2100-01-31,32869,,0,0,31,,0,0,10,2,,308182,19073183,, +5,2,19073183,2100-02-01,2100-02-01 12:00:00,2100-02-28,2100-02-28 00:00:00,2100-02-28,32869,,0,0,28,,0,0,10,2,,308182,19073183,, +6,2,19019979,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,2100-01-31,32869,,0,0,31,,0,0,10,1,,198405,19019979,, +7,3,19073183,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,2100-01-31,32869,,0,0,31,,0,0,10,3,,308182,19073183,, +8,3,19073183,2100-02-01,2100-02-01 12:00:00,2100-02-28,2100-02-28 00:00:00,2100-02-28,32869,,0,0,28,,0,0,10,3,,308182,19073183,, +9,3,19019979,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,2100-01-31,32869,,0,0,31,,0,0,10,1,,198405,19019979,, diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py index 01c995a..41a99ba 100644 --- a/tests/test_io/test_omop.py +++ b/tests/test_io/test_omop.py @@ -14,6 +14,7 @@ "measurement": 2, "observation": 2, "specimen": 2, + "drug_exposure": 2, } # constants for setup_variables @@ -136,6 +137,56 @@ def test_setup_variables( ) +@pytest.mark.parametrize( + "observation_table", + [ + "person_cohort", + ], # "person_observation_period", "person_visit_occurrence"], +) +@pytest.mark.parametrize( + "data_tables,data_field_to_keep", + [ + (["drug_exposure"], ["days_supply"]), # ["one-hot"] + ], +) +@pytest.mark.parametrize( + "enrich_var_with_feature_info", + 
[False], # True, +) +@pytest.mark.parametrize( + "keep_date", + ["start", "end"], # "interval" +) +def test_setup_interval_variables( + omop_connection_vanilla, + observation_table, + data_tables, + data_field_to_keep, + enrich_var_with_feature_info, + keep_date, +): + num_intervals = 4 + con = omop_connection_vanilla + edata = ed.io.omop.setup_obs(backend_handle=con, observation_table=observation_table) + edata = ed.io.omop.setup_interval_variables( + edata, + backend_handle=con, + data_tables=data_tables, + data_field_to_keep=data_field_to_keep, + interval_length_number=1, + interval_length_unit="day", + num_intervals=num_intervals, + enrich_var_with_feature_info=enrich_var_with_feature_info, + keep_date=keep_date, + ) + + assert isinstance(edata, ed.EHRData) + assert edata.n_obs == VANILLA_PERSONS_WITH_OBSERVATION_TABLE_ENTRY[observation_table] + assert edata.n_vars == VANILLA_NUM_CONCEPTS[data_tables[0]] + assert edata.r.shape[2] == num_intervals + assert edata.var.shape[1] == VAR_DIM_BASE + (VAR_DIM_FEATURE_INFO if enrich_var_with_feature_info else 0) + + @pytest.mark.parametrize( "edata, backend_handle, data_tables, data_field_to_keep, interval_length_number, interval_length_unit, num_intervals, enrich_var_with_feature_info, enrich_var_with_unit_info, expected_error", [ From afad70261185cff9d6347b7eba36350d000b6181 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Wed, 20 Nov 2024 11:00:52 +0100 Subject: [PATCH 30/43] pypots minimal demo --- .../tutorial_time_series_with_pypots.ipynb | 274 ++++++++++++++++++ 1 file changed, 274 insertions(+) create mode 100644 docs/notebooks/tutorial_time_series_with_pypots.ipynb diff --git a/docs/notebooks/tutorial_time_series_with_pypots.ipynb b/docs/notebooks/tutorial_time_series_with_pypots.ipynb new file mode 100644 index 0000000..3fbe3d2 --- /dev/null +++ b/docs/notebooks/tutorial_time_series_with_pypots.ipynb @@ -0,0 +1,274 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Time Series Analysis with ehrdata and PyPOTS\n", + "ehrdata supports the extraction of data from the OMOP Common Data Model, as well as prepared datasets such as the Physionet 2012 Challenge.\n", + "\n", + "Once data is in the ehrdata format, ehrapy can operate on the data with\n", + "- exploratory data analysis\n", + "- utility functions for time series (ep.timeseries coming soon)\n", + "- ...\n", + "\n", + "From ehrdata, also fast deep-learning based time series analysis can be done using e.g. [PyPOTS](https://github.com/WenjieDu/PyPOTS).\n", + "PyPOTS is a Python toolkit/library for reality-centric machine/deep learning and data mining on partially-observed time series, including SOTA neural network models for scientific analysis tasks of imputation/classification/clustering/forecasting/anomaly detection/cleaning on incomplete industrial (irregularly-sampled) multivariate TS with NaN missing values." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example case: From any dataset in OMOP CDM 5.4 to applying DL for ..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#! 
pip install pypots" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [], + "source": [ + "import duckdb\n", + "import ehrdata as ed\n", + "import pypots\n", + "import ehrapy as ep" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load and extract data" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO - Downloading Synthea27Nj_5.4.zip from https://github.com/OHDSI/EunomiaDatasets/raw/main/datasets/Synthea27Nj/Synthea27Nj_5.4.zip to /var/folders/yy/60ln_681745_fjjwvgwm_nyc0000gn/T/tmpfndmdvwt/Synthea27Nj_5.4.zip\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "254776f379994eeab1835ffe42fe89a1", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO - Extracted archive Synthea27Nj_5.4.zip from /var/folders/yy/60ln_681745_fjjwvgwm_nyc0000gn/T/tmpfndmdvwt/Synthea27Nj_5.4.zip to ehrapy_data/Synthea27Nj_5.4/Synthea27Nj_5.4\n",
+      "INFO - missing tables: []\n",
+      "INFO - unused files: ['EPISODE.csv', '__MACOSX', 'EPISODE_EVENT.csv']\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "multiple units for features: []\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/eljas.roellin/Documents/ehrapy_workspace/ehrapy_venv_oct/lib/python3.11/site-packages/anndata/_core/aligned_df.py:68: ImplicitModificationWarning: Transforming to str index.\n",
+      "  warnings.warn(\"Transforming to str index.\", ImplicitModificationWarning)\n",
+      "/Users/eljas.roellin/Documents/ehrapy_workspace/ehrapy_venv_oct/lib/python3.11/site-packages/pandas/core/generic.py:3331: UserWarning: Converting non-nanosecond precision datetime values to nanosecond precision. This behavior can eventually be relaxed in xarray, as it is an artifact from pandas which is now beginning to support non-nanosecond precision values. This warning is caused by passing non-nanosecond np.datetime64 or np.timedelta64 values to the DataArray or Variable constructor; it can be silenced by converting the values to nanosecond precision ahead of time.\n",
+      "  return xarray.Dataset.from_dataframe(self)\n",
+      "/Users/eljas.roellin/Documents/ehrapy_workspace/ehrapy_venv_oct/lib/python3.11/site-packages/pandas/core/generic.py:3331: UserWarning: Converting non-nanosecond precision datetime values to nanosecond precision. This behavior can eventually be relaxed in xarray, as it is an artifact from pandas which is now beginning to support non-nanosecond precision values. This warning is caused by passing non-nanosecond np.datetime64 or np.timedelta64 values to the DataArray or Variable constructor; it can be silenced by converting the values to nanosecond precision ahead of time.\n",
+      "  return xarray.Dataset.from_dataframe(self)\n",
+      "/Users/eljas.roellin/Documents/ehrapy_workspace/ehrapy_venv_oct/lib/python3.11/site-packages/pandas/core/generic.py:3331: UserWarning: Converting non-nanosecond precision datetime values to nanosecond precision. This behavior can eventually be relaxed in xarray, as it is an artifact from pandas which is now beginning to support non-nanosecond precision values. This warning is caused by passing non-nanosecond np.datetime64 or np.timedelta64 values to the DataArray or Variable constructor; it can be silenced by converting the values to nanosecond precision ahead of time.\n",
+      "  return xarray.Dataset.from_dataframe(self)\n",
+      "/Users/eljas.roellin/Documents/ehrapy_workspace/ehrapy_venv_oct/lib/python3.11/site-packages/pandas/core/generic.py:3331: UserWarning: Converting non-nanosecond precision datetime values to nanosecond precision. This behavior can eventually be relaxed in xarray, as it is an artifact from pandas which is now beginning to support non-nanosecond precision values. This warning is caused by passing non-nanosecond np.datetime64 or np.timedelta64 values to the DataArray or Variable constructor; it can be silenced by converting the values to nanosecond precision ahead of time.\n",
+      "  return xarray.Dataset.from_dataframe(self)\n",
+      "/Users/eljas.roellin/Documents/ehrapy_workspace/ehrapy_venv_oct/lib/python3.11/site-packages/anndata/_core/aligned_df.py:68: ImplicitModificationWarning: Transforming to str index.\n",
+      "  warnings.warn(\"Transforming to str index.\", ImplicitModificationWarning)\n"
+     ]
+    }
+   ],
+   "source": [
+    "con_gi = duckdb.connect(database=\":memory:\", read_only=False)\n",
+    "ed.dt.synthea27nj_omop(\n",
+    "    con_gi,\n",
+    ")\n",
+    "edata = ed.io.omop.setup_obs(\n",
+    "    con_gi,\n",
+    "    observation_table=\"person_observation_period\",\n",
+    ")\n",
+    "edata = ed.io.omop.setup_variables(\n",
+    "    edata=edata,\n",
+    "    backend_handle=con_gi,\n",
+    "    data_tables=[\"measurement\"],\n",
+    "    data_field_to_keep=[\"value_as_number\"],\n",
+    "    interval_length_number=20,\n",
+    "    interval_length_unit=\"day\",\n",
+    "    num_intervals=10,\n",
+    "    concept_ids=\"all\",\n",
+    "    aggregation_strategy=\"last\",\n",
+    "    enrich_var_with_feature_info=True,\n",
+    "    enrich_var_with_unit_info=True,\n",
+    ")"
+   ]
+  },
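
setup_variables above covers the point-in-time tables (measurement, observation, specimen). For interval-style tables such as drug_exposure, the setup_interval_variables function introduced earlier in this series is the analogous entry point; its keep_date argument ("start" or "end"; "interval" is reserved for a later patch) selects which bound of each exposure gets binned into the time grid. A minimal sketch mirroring the new test, under the assumption that the loaded dataset ships a populated drug_exposure table:

    # Sketch only: interval-table extraction with the API added in this series.
    edata_drug = ed.io.omop.setup_obs(
        backend_handle=con_gi, observation_table="person_observation_period"
    )
    edata_drug = ed.io.omop.setup_interval_variables(
        edata_drug,
        backend_handle=con_gi,
        data_tables=["drug_exposure"],
        data_field_to_keep=["days_supply"],
        interval_length_number=1,
        interval_length_unit="day",
        num_intervals=10,
        keep_date="start",  # bin each exposure by its start date; "end" uses the end date
    )
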
+  {
+   "cell_type": "code",
+   "execution_count": 70,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "EHRData object with n_obs x n_var = 28 x 132, and a timeseries of 10 steps.\n",
+       "             shape of .X: (0, 0) \n",
+       "             shape of .r: (28, 132, 10) "
+      ]
+     },
+     "execution_count": 70,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "edata"
+   ]
+  },
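
The .r field is the tensor the rest of the tutorial operates on: axis 0 holds the 28 persons, axis 1 the 132 measurement concepts, and axis 2 the 10 time intervals of 20 days each (roughly the first 200 days of every observation period), with the interval grid itself exposed via .t. A quick orientation check, assuming the edata object from the cells above:

    # (n_obs, n_vars, n_steps) layout; .t describes the time axis.
    assert edata.r.shape == (edata.n_obs, edata.n_vars, 10)
    edata.t  # interval steps recorded during setup_variables
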
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Fit Model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "saits = pypots.imputation.saits.SAITS(\n",
+    "    n_steps=10,\n",
+    "    n_features=edata.shape[1],\n",
+    "    n_layers=1,\n",
+    "    d_model=10,\n",
+    "    n_heads=2,\n",
+    "    d_k=10,\n",
+    "    d_v=10,\n",
+    "    d_ffn=10,\n",
+    ")\n",
+    "\n",
+    "saits.fit({\"X\": edata.r.transpose(0, 2, 1)})"
+   ]
+  },
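
PyPOTS models consume arrays of shape (n_samples, n_steps, n_features), whereas edata.r is stored as (n_obs, n_vars, n_steps); that is the sole purpose of the transpose(0, 2, 1) above. SAITS' n_steps must match the num_intervals used during extraction and n_features the number of variables; the remaining arguments (n_layers, d_model, n_heads, d_k, d_v, d_ffn) are ordinary transformer hyperparameters, kept deliberately small for this demo. A sanity check on the same object:

    # Reorder (n_obs, n_vars, n_steps) into the (n_samples, n_steps, n_features)
    # layout PyPOTS expects; NaNs in .r mark the gaps SAITS will impute.
    X = edata.r.transpose(0, 2, 1)
    assert X.shape == (edata.n_obs, 10, edata.n_vars)  # (28, 10, 132) here
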
+  {
+   "cell_type": "code",
+   "execution_count": 63,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "predictions = saits.predict({\"X\": edata.r.transpose(0, 2, 1)}, return_latent_vars=True)\n",
+    "edata.obsm[\"saits_latent\"] = predictions[\"latent_vars\"][\"combining_weights\"][:, :, -1]"
+   ]
+  },
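
Besides the latent variables stored above, predict also returns the imputed series itself. A short sketch of retrieving it, assuming the standard PyPOTS result key "imputation":

    # "imputation" holds the completed tensor in the same
    # (n_samples, n_steps, n_features) layout as the model input.
    imputed = predictions["imputation"]
    r_imputed = imputed.transpose(0, 2, 1)  # back to (n_obs, n_vars, n_steps)
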
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Show 2D representation"
+   ]
+  },
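
One plausible way to turn the stored latent space into the 2-D scatter shown below, using ehrapy's scanpy-style API (assuming ep.pp.neighbors, ep.tl.umap, and ep.pl.umap follow the usual scanpy signatures; the cell's own source is not reproduced here):

    # Hypothetical reconstruction of the plotting step.
    ep.pp.neighbors(edata, use_rep="saits_latent")  # kNN graph on the SAITS weights
    ep.tl.umap(edata)                               # 2-D embedding
    ep.pl.umap(edata)                               # scatter plot of the embedding
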
+  {
+   "cell_type": "code",
+   "execution_count": 69,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "... storing 'concept_id_unit' as categorical\n",
+      "... storing 'concept_id_unit' as categorical\n"
+     ]
+    },
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAABPMAAAGvCAYAAAA+FJCFAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACQWElEQVR4nOzdd3hUVf7H8c/MpFcCBJJQAoQeQIrggiAgCkEEsaOugl3QVURQ1EVkEREL6g9d7IAu9oKKBRABRRSkN+kl9IQAKaTP3N8fLLOOJDAzmZbk/Xqe+6z33nPP+c4IO8fvPcVkGIYhAAAAAAAAAAHP7O8AAAAAAAAAADiHZB4AAAAAAABQSZDMAwAAAAAAACoJknkAAAAAAABAJUEyDwAAAAAAAKgkSOYBAAAAAAAAlQTJPAAAAAAAAKCSIJkHAAAAAAAAVBIk8wAAAAAAAIBKgmQeUEksXrxYJpNJixcv9ncoKMewYcPUqFGjc5bbs2ePTCaTZs6c6fWYAAAAqhKTyaQnn3zS32EAgF+RzAOAP/n222/pIAIAAAAAAhbJPAD4k2+//VYTJkxw69k333xTW7du9XBEAAAAAAD8D8k8oJo6efKkv0OocoKDgxUaGurvMAAAqHbo15wb3xEAVB0k84CzWLx4sc4//3yFhYUpJSVFr7/+up588kmZTCaHcv/5z3/UqVMnhYeHq2bNmhoyZIj27dvnUKZXr15q06aNNm/erN69eysiIkL16tXTs88+e0a7+/fv1+DBgxUZGak6derowQcfVFFRUZkxLl++XGlpaYqNjVVERIR69uypX375xaHM6Zg3b96sG2+8UXFxcerevbvT30NhYaGefPJJNW/eXGFhYUpMTNRVV12lnTt32sucPHlSDz30kBo0aKDQ0FC1aNFCzz//vAzDcKjLZDLpvvvu05w5c9SmTRuFhoYqNTVV33///RntHjhwQLfffruSkpIUGhqqxo0ba/jw4SouLraXOXHihEaOHGlvt2nTppoyZYpsNpu9zOk16p5//nm9+OKLSk5OVnh4uHr27KmNGzfayw0bNkyvvvqqPc7Th7PKWjPvxIkTGjZsmGJjY1WjRg0NHTpUJ06ccLpOAADgqLx+zfr16zVs2DA1adJEYWFhSkhI0G233aasrKwz6vBUH8MZ27dv19VXX62EhASFhYWpfv36GjJkiLKzs+1lSktLNXHiRKWkpCg0NFSNGjXSY489dkb/r7z14ho1aqRhw4bZz2fOnCmTyaQlS5ZoxIgRqlOnjurXr2+//91336lnz56Kjo5WTEyMOnfurPfff9+hTmf6mGdTUlKimjVr6tZbbz3jXk5OjsLCwjR69GhJUnFxsZ544gl16tRJsbGxioyMVI8ePbRo0aJztlPemsVl9dkl5/rtABDogvwdABCo1qxZo7S0NCUmJmrChAmyWq3617/+pfj4eIdykyZN0rhx43TdddfpjjvuUGZmpqZNm6aLLrpIa9asUY0aNexljx8/rrS0NF111VW67rrr9Omnn+qRRx5R27Zt1b9/f0lSQUGB+vTpo/T0dN1///1KSkrSe++9px9//PGMGH/88Uf1799fnTp10vjx42U2mzVjxgxdfPHF+vnnn9WlSxeH8tdee62aNWump59++owkW3msVqsuv/xyLVy4UEOGDNEDDzyg3NxcLViwQBs3blRKSooMw9CgQYO0aNEi3X777Wrfvr3mzZunMWPG6MCBA3rxxRcd6ly6dKk+//xzjRgxQtHR0fq///s/XX311UpPT1etWrUkSQcPHlSXLl104sQJ3XXXXWrZsqUOHDigTz/9VPn5+QoJCVF+fr569uypAwcO6O6771bDhg21bNkyPfroozp06JBeeuklh3bfffdd5ebm6t5771VhYaFefvllXXzxxdqwYYPq1q2ru+++WwcPHtSCBQv03nvvOfX9nI1hGLriiiu0dOlS3XPPPWrVqpW++OILDR06tMJ1AwBQ3f21X7NgwQLt2rVLt956qxISErRp0ya98cYb2rRpk3777Td7YscbfYzyFBcXq1+/fioqKtI//vEPJSQk6MCBA5o7d65OnDih2NhYSdIdd9yhWbNm6ZprrtFDDz2k5cuXa/Lkyfrjjz/0xRdfuP0djRgxQvHx8XriiSfsI/Nmzpyp2267TampqXr00UdVo0YNrVmzRt9//71uvPFGSa73McsSHBysK6+8Up9//rlef/11hYSE2O/NmTNHRUVFGjJkiKRTyb233npLN9xwg+68807l5ubq7bffVr9+/bRixQq1b9/e7e/gz1zptwNAQDMAlGngwIFGRESEceDAAfu17du3G0FBQcbpvzp79uwxLBaLMWnSJIdnN2zYYAQFBTlc79mzpyHJePfdd+3XioqKjISEBOPqq6+2X3vppZcMScbHH39sv3by5EmjadOmhiRj0aJFhmEYhs1mM5o1a2b069fPsNls9rL5+flG48aNjUsvvdR+bfz48YYk44YbbnD5e3jnnXcMScbUqVPPuHe63Tlz5hiSjKeeesrh/jXXXGOYTCZjx44d9muSjJCQEIdr69atMyQZ06ZNs1+75ZZbDLPZbPz+++/ltjtx4kQjMjLS2LZtm8P9sWPHGhaLxUhPTzcMwzB2795tSDLCw8ON/fv328stX77ckGQ8+OCD9mv33nuv4e7/NQ4dOtRITk62n5/+Xp599ln7tdLSUqNHjx6GJGPGjBlutQMAQHVWXr8mPz//jLIffPCBIcn46aef7Nc82cc4lzVr1hiSjE8++aTcMmvXrjUkGXfccYfD9dGjRxuSjB9//NF+TZIxfvz4M+pITk42hg4daj+fMWOGIcno3r27UVpaar9+4sQJIzo62rjggguMgoIChzpOf3ZX+pjnMm/ePEOS8fXXXztcv+yyy4wmTZrYz0tLS42ioiKHMsePHzfq1q1r3HbbbQ7X//od/LX/ddrpPyenudJvB4BAxzRboAxWq1U//PCDBg8erKSkJPv1pk2b2kfQSdLnn38um82m6667TkePHrUfCQkJatas2RlTA6KiovT3v//dfh4SEqIuXbpo165d9mvffvutEhMTdc0119ivRURE6K677nKoa+3atdq+fbtuvPFGZWVl2ds+efKk+vTpo59++umMaSD33HOPy9/FZ599ptq1a+sf//jHGfdOv+H+9ttvZbFYdP/99zvcf+ihh2QYhr777juH65dccolSUlLs5+3atVNMTIz9e7DZbJozZ44GDhyo888/v9x2P/nkE/Xo0UNxcXEO3/8ll1wiq9Wqn376yeG5wYMHq169evbzLl266IILLtC3337rylfitG+//VZBQUEaPny4/ZrFYinzuwQAAK75a78mPDzc/s+FhYU6evSo/va3v0mSVq9eLcl7fYzynB55N2/ePOXn55dZ5nQ/ZNSoUQ7XH3roIUnSN99841RbZbnzzjtlsVjs5wsWLFBubq7Gjh2rsLAwh7KnP7s7fczyXHzxxapdu7Y++ugj+7Xjx49rwYIFuv766+3XLBaLfeSezW
bTsWPHVFpaqvPPP9/+766iXO23A0AgY5otUIaMjAwVFBSoadOmZ9z787Xt27fLMAw1a9aszHqCg4MdzuvXr3/G2h1xcXFav369/Xzv3r1q2rTpGeVatGjhcL59+3ZJOuuUzezsbMXFxdnPGzduXG7Z8uzcuVMtWrRQUFD5/3exd+9eJSUlKTo62uF6q1at7Pf/rGHDhmfUERcXp+PHj0uSMjMzlZOTozZt2pw1tu3bt2v9+vVnTH0+LSMjw+G8rH9PzZs318cff3zWdty1d+9eJSYmKioqyuH6X/9dAgAA1/21X3Ps2DFNmDBBH3744Rl9gNPr03mrj3G2GEeNGqWpU6dq9uzZ6tGjhwYNGqS///3v9kTf3r17ZTabz+h3JiQkqEaNGmf0o1zx1+/o9HrHZ/v87vQxyxMUFKSrr75a77//voqKihQaGqrPP/9cJSUlDsk8SZo1a5ZeeOEFbdmyRSUlJeV+Bne52m8HgEBGMg+oAJvNJpPJpO+++87hredpf03ilFVGktPr1/21bUl67rnnyl1H5K/t//mNtT956nuw2Wy69NJL9fDDD5d5v3nz5i7HBgAAKoe/9muuu+46LVu2TGPGjFH79u0VFRUlm82mtLQ0lzet8GQf44UXXtCwYcP05Zdfav78+br//vs1efJk/fbbbw6bUriy6dZfWa3WMq+70/dzp495NkOGDNHrr7+u7777ToMHD9bHH3+sli1b6rzzzrOX+c9//qNhw4Zp8ODBGjNmjOrUqSOLxaLJkyc7bLhWlvK+t79+J6722wEgkJHMA8pQp04dhYWFaceOHWfc+/O105s/NG7c2GOJo+TkZG3cuFGGYTh0TrZu3epQ7vQ01ZiYGF1yySUeabssKSkpWr58uUpKSsp9Y5mcnKwffvhBubm5DqPztmzZYr/vivj4eMXExDjsNFtebHl5eU5//tNvmv9s27ZtDjugVaQj/VfJyclauHCh8vLyHDqIf/13CQAAKub48eNauHChJkyYoCeeeMJ+/a+//d7qY5xL27Zt1bZtW/3zn//UsmXLdOGFF+q1117TU089peTkZNlsNm3fvt0+q0GSjhw5ohMnTjj0o+Li4nTixAmHuouLi3Xo0CGn4jjdf9y4cWOZM1D+XMZTfcyLLrpIiYmJ+uijj9S9e3f9+OOPevzxxx3KfPrpp2rSpIk+//xzh77Y+PHjz1l/Wd+JdObMEG/02wHAX1gzDyiDxWLRJZdcojlz5ujgwYP26zt27HBY/+2qq66SxWLRhAkTzhhVZhiGsrKyXG77sssu08GDB/Xpp5/ar+Xn5+uNN95wKNepUyelpKTo+eefV15e3hn1ZGZmutx2Wa6++modPXpUr7zyyhn3Tn/myy67TFar9YwyL774okwmk8M6g84wm80aPHiwvv76a61cubLcdq+77jr9+uuvmjdv3hllTpw4odLSUodrc+bM0YEDB+znK1as0PLlyx3ii4yMtD9fUZdddplKS0s1ffp0+zWr1app06ZVuG4AAPA/p0da/bU/9tddZ73VxyhPTk7OGWXbtm0rs9msoqIiSaf6C2XFOnXqVEnSgAED7NdSUlLOWK/vjTfeKHdk3l/17dtX0dHRmjx5sgoLCx3unf7snu5jms1mXXPNNfr666/13nvvqbS09IwptmX9+1u+fLl+/fXXc9afkpKi7Oxsh2VrDh06dMYuwN7otwOAvzAyDyjHk08+qfnz5+vCCy/U8OHD7cmqNm3aaO3atZJOdR6eeuopPfroo9qzZ48GDx6s6Oho7d69W1988YXuuusujR492qV277zzTr3yyiu65ZZbtGrVKiUmJuq9995TRESEQzmz2ay33npL/fv3V2pqqm699VbVq1dPBw4c0KJFixQTE6Ovv/66wt/DLbfconfffVejRo3SihUr1KNHD508eVI//PCDRowYoSuuuEIDBw5U79699fjjj2vPnj0677zzNH/+fH355ZcaOXKkw2YXznr66ac1f/589ezZU3fddZdatWqlQ4cO6ZNPPtHSpUtVo0YNjRkzRl999ZUuv/xyDRs2TJ06ddLJkye1YcMGffrpp9qzZ49q165tr7Np06bq3r27hg8frqKiIr300kuqVauWwxSaTp06SZLuv/9+9evXTxaLRUOGDHHruxs4cKAuvPBCjR07Vnv27FHr1q31+eef29ftAQAAnhETE6OLLrpIzz77rEpKSlSvXj3Nnz9fu3fvPqOsN/oY5fnxxx9133336dprr1Xz5s1VWlqq9957TxaLRVdffbUk6bzzztPQoUP1xhtv6MSJE+rZs6dWrFihWbNmafDgwerdu7e9vjvuuEP33HOPrr76al166aVat26d5s2b51Qsp7+nF198UXfccYc6d+6sG2+8UXFxcVq3bp3y8/M1a9Ysr/Qxr7/+ek2bNk3jx49X27ZtHUYgStLll1+uzz//XFdeeaUGDBig3bt367XXXlPr1q3LTCj+2ZAhQ/TII4/oyiuv1P3336/8/HxNnz5dzZs3d9g8wxv9dgDwG19vnwtUJgsXLjQ6dOhghISEGCkpKcZbb71lPPTQQ0ZYWJhDuc8++8zo3r27ERkZaURGRhotW7Y07r33XmPr1q32Mj179jRSU1PPaGPo0KFGcnKyw7W9e/cagwYNMiIiIozatWsbDzzwgPH9998bkoxFixY5lF2zZo1x1VVXGbVq1TJCQ0ON5ORk47rrrjMWLlxoLzN+/HhDkpGZmenW95Cfn288/vjjRuPGjY3g4GAjISHBuOaaa4ydO3fay+Tm5hoPPvigkZSUZAQHBxvNmjUznnvuOcNmsznUJcm49957z2gjOTnZGDp06Bnfwy233GLEx8cboaGhRpMmTYx7773XKCoqcmj30UcfNZo2bWqEhIQYtWvXNrp162Y8//zzRnFxsWEYhrF7925DkvHcc88ZL7zwgtGgQQMjNDTU6NGjh7Fu3TqHNktLS41//OMfRnx8vGEymQxX/m+yrH+XWVlZxs0332zExMQYsbGxxs0332ysWbPGkGTMmDHD6boBAMAp5fVr9u/fb1x55ZVGjRo1jNjYWOPaa681Dh48aEgyxo8f71DWU32Mc9m1a5dx2223GSkpKUZYWJhRs2ZNo3fv3sYPP/zgUK6kpMSYMGGCva/VoEED49FHHzUKCwsdylmtVuORRx4xateubURERBj9+vUzduzYcUY/asaMGYYk4/fffy8zrq+++sro1q2bER4ebsTExBhdunQxPvjgA4cyzvQxnWWz2YwGDRoYkoynnnqqzPtPP/20kZycbISGhhodOnQw5s6dW2bfqqx/n/PnzzfatGljhISEGC1atDD+85//2P+c/JUz/XYACHQmw3Bj5X2gGhs8eLA2bdpU5vprCEx79uxR48aN9dxzz/HGFQAAAABQqbFmHnAWBQUFDufbt2/Xt99+q169evknIAAAAAAAUK2xZh5wFk2aNNGwYcPUpEkT7d27V9OnT1dISIjDGmuVVXFxsY4dO3bWMrGxsQoPD/dRRIHr2LFjKi4uLve+xWJRfHy8DyMCAAD+V
p37B1ar9ZwbYURFRSkqKspHEQFA9UIyDziLtLQ0ffDBBzp8+LBCQ0PVtWtXPf3002rWrJm/Q6uwZcuWOSyoXJYZM2Zo2LBhvgkogF111VVasmRJufeTk5O1Z88e3wUEAAD8rjr3D/bt26fGjRuftcz48eP15JNP+iYgAKhmWDMPqKaOHz+uVatWnbVMamqqEhMTfRRR4Fq1apWOHz9e7v3w8HBdeOGFPowIAAD4W3XuHxQWFmrp0qVnLdOkSRM1adLERxEBQPVCMg8AAAAAAACoJNgAAwAAAAAAAKgk/LJmns1m08GDBxUdHS2TyeSPEAAAwDkYhqHc3FwlJSXJbHZ8/1dYWHjWhd//KiQkRGFhYZ4OER5C3wwAgMrhbP0zVB9+SeYdPHhQDRo08EfTAADARfv27VP9+vXt54WFhWqcHKXDGVan60hISNDu3btJ6AUo+mYAAFQuf+2foXrxSzIvOjpa0qk/fDExMf4IAQCAs9q+aqf+OegZFeYV+TWO8/udp0dnP6CgYN//ZOfk5KhBgwb23+3TiouLdTjDqt2rkhUTfe43wjm5NjXutFfFxcUk8wIUfTMAQGWw6ecD+uWzHf4OQ10ub6z2lzT0S9vl9c9QvfglmXd6+kZMTAwdRgBAwDl+5IQmXz9NpSdtCjIF+zWWtfM3a/b4z/WPV+7wWwzlTbuMiTY7lcxD4KNvBgAIdOmbs7R67iGFh0T6OxRtmJ+heo3rKKVDHb/FwLIY1Rs9cAAA/uLl4W8o+2iuv8Ow+3r6fK1dtNHfYZzBaticPgAAANxVXFCqRe9tkQx/R/I/S97fqoI859cPBjyJZB4AAH/y4/s/65c5v/s7DAeGYeiF2/+tgrwCf4fiwCbD6QMAAMBdSz/drrzj/l365K8Kckv00wfb/B0GqimSeQAA/NfxIyf06gMz/B1GmQ7vydSbj8z2dxgAAAA+lb45S3/8csjfYZRpx6oM7Vyd4e8wUA35Zc08AAACiWHLk0o2asvidzVi4kY1a5uvmnVLFRJqk8kklRSblXvCop2bwrV9fbi2r4vQtnXhOp7p2/X0vnljgYY8coXqNIz3abvlsckmZybQOlcKAADgT3IOSQfX6Lf3SiVF+Tuacv325S6ldPTf2nmonkjmAQCqLaN4lYz896XC7yWV6IKeZZezBNkUFmFTfFKJ/nZpjiTJZpPW/RKlr2fW0q/zYmWzeX8RYpvVpq9fW6Dbn77R6205w2oYshrnnkLrTBkAAABZS6Q/vpZ+f1vau1RHipsp8/iz/o7qrE4cyde+LcfUoGVNf4eCaoRkHgCgWjEMQyr4XEb+LKl0i9v1mM1Shx556tAjT5kHg/Xtf2rps9drq6jAUqH4YmqWqnm7fDVrV6AmrQsUEWNVSKghm9Wk4iKTcrNfU2mOTZbw86SgFjKZQirUHgAAgN8V5UrLpkmrZkp5R+yXN+an+S8mFyyZvVVdBjZWfMNo1agbwU6z8DqSeQCAasMoTZeR/ahU4tkNLuKTSjT04cO69LpjmjqqgTb85tpUkLr1izXglqPqecUJJTQoOUfpXCn/XzLyJSlYRnBbmcKvk8IHyGQKdfcjuMXZzS3YAAMAAJRr54/SV/dL2fscLhfaorS98EI/BeWa7MwCLXhnsyQpJMyi+q1qqk2PeqrfKo7EHryCZB4AoMozDEPKf09G3guS4b0dYZMaFevZT3fq65m19PakxHOO0ut8cY4GDjuq83vnyuLWgL4SqWS1jJLVUu4zMsKvlCniJpmCGroVv6tsMmQlmQcAANxRlCvNe1xaPavM2+lF7WWVb19UekJxoVW71mRq15pM1agbodQeSWp1YZJCw0m/wHP40wQAqNIMo1jG8buk4mU+ac9slq64LUvtL8zTozekKOvwmZtkxCcVa+Tz+3R+rzzPNWyckPJnyMj/jxR1nxR5l0ymik35PRdG5gEAALcc2y3NulzK3l9ukYySpj4MyDtOHMnXL5/u0JoF6ep1U0s1blfb3yGhiiCZBwCocozi32UU/SyVrJOKf5dU6vMYklsUaeqcHXr42iY6su9/b5X735ilO584qMgYb+3wWiIj70WpcIEUO0Wm4GZeagcAAMBJpcXS1m+kvcukvb9KRzZK53jZl1mS4pvYfCA/u1jf/nu9ml9QVz2ua66wyDNf9gKuIJkHAKgSDFueVDBHRsEHUul2f4cjSUpoWKwpH+/SqMFNdTLHrMdeS7fvhut1pRtlZA2WYsbJFDHEK02wmy0AADirE+nSynekNf+RTmY6/ZhhSEdLG3sxMP/YtvyI9m85rsvuaae6jWP8HQ4qMZJ5AIBKzTCKZeS9KuW/Kxkn/R3OGRKTi/X0+7tUkG9S607eW6+vbCUycp6QbDkyRd3l8dpt/z2cKQcAAKqR3MPSd49If3wlGa73BPJstVVsRHohMP/Lzy7Wly+t0WUj2ql+izh/h4NKyuzvAAAAcJdRsl5G1pXSyekBmcg7rXGrQj8k8v7HyHtexsm3/dY+AACoRtZ+IL16gbR5jluJPEkqtoV7NqYAU1Jk1TevrtPBHSf8HQoqKZJ5AIBKxzBKZct9QUbW9QEzpTbQGblTZBR86dE6rf/dzdaZAwAAVHF5GdL710tz7pEKT1SoKpuq/ppypcU2ffPqeh07FLgvpBG4SOYBACoVwyiSceIf0snXJVn9HU6lYuQ8KcN60GP1WQ3nDwAAUIUd3yu93Vfa9r1HqjP7YfMyfyguKNXCmZtls9FZgmtI5gEAKg3DKJZx/G6paKG/Q6mcjJMysh/3dxQAAKAqOb5HmtFfOr7bY1VaTMUeqyvQZezN1Zr5e/0dBioZknkAgErBMEplnHhAKl7m71Aqt+JfZOR/6JGqbC4cAACgCso9LL17hZRzwKPVRluOyqwSj9YZyFbM3a2sg3n+DgOVCMk8AEClYOT9mxF5HmLkPivDVvEOo00mWZ04bDJ5IGoAABBQDEP69PZTI/M8zGIqVc2gfR6vN1DZSg0t/Zh1oOE8knkAgIBnlGz+7xp58AgjTyqY4+8oAABAZbbiTWnvUq9VHx+802t1B6L9W47r+GE2w4BzSOYBAAKaYZTIyB4rVaOpFr5gFHxQ4TpshvMHAACoQo7tln540qtN1KlmyTxJ2rDEs9OVUXWRzAMABLaTb0qlW/wdRdVTul1G8YoKVeHMFNvTBwAAqEK++odU4t1RZPVCNni1/kC09bfDKimy+jsMVAIk8wAAAcswimScnOnvMKoso4JTbUnmAQBQDe1fKe352evNxAUdrHYJveKCUu1el+nvMFAJkMwDAASuwm8k44S/o6i6Stb7O4IyWa1WjRs3To0bN1Z4eLhSUlI0ceJEGcb/5usOGzZMJpPJ4UhLS3OoZ9CgQWrYsKHCwsKUmJiom2++WQcPHnQos379evXo0UNhYWFq0KCBnn32WZ98RgAAKq3f3/JZU20ivvdZW4HiyJ4cf4eASiDI3wEAAFAeI/99f4dQtZXulGEUyGQKd+txm2GSzTj3qDtnyvzZlClTNH36dM2aNUupqalauXKlbr31VsXGxur+
+++3l0tLS9OMGTPs56GhoQ719O7dW4899pgSExN14MABjR49Wtdcc42WLVsmScrJyVHfvn11ySWX6LXXXtOGDRt02223qUaNGrrrrrtcihkAgGoh/5i08XOfNdck9DdFmI8p31bTZ236W+beXH+HgEqAZB4AICAZJZsDduRY1WGVSrZIIR3cfNq5KbSuTrNdtmyZrrjiCg0YMECS1KhRI33wwQdascJxjb/Q0FAlJCSUW8+DDz5o/+fk5GSNHTtWgwcPVklJiYKDgzV79mwVFxfrnXfeUUhIiFJTU7V27VpNnTqVZB4AAGVZ8x/JWuSz5swmm9pHfqllubf6rE1/y9yfJ8NmyGRmmRKUj2m2AIDAVLzM3xFUDyWbfNZUTk6Ow1FUVPZ/DHTr1k0LFy7Utm3bJEnr1q3T0qVL1b9/f4dyixcvVp06ddSiRQsNHz5cWVlZ5bZ97NgxzZ49W926dVNwcLAk6ddff9VFF12kkJAQe7l+/fpp69atOn78eEU/LgAAVc/uJT5v8ryIuaoTvM3n7fpLaZFVx4/k+zsMBDiSeQCAgGSUbPR3CNVDBdYktMrs9CFJDRo0UGxsrP2YPHlymfWOHTtWQ4YMUcuWLRUcHKwOHTpo5MiRuummm+xl0tLS9O6772rhwoWaMmWKlixZov79+8tqddwB7pFHHlFkZKRq1aql9PR0ffnll/Z7hw8fVt26dR3Knz4/fPiw298LAABV1sG1Pm/SbLKpT+w0WVTs87b9pfBkib9DQIBjmi0AIDD5cMRYdWYYRW7vNWs4uWae8d8y+/btU0xMjP36X9e4O+3jjz/W7Nmz9f7779unvo4cOVJJSUkaOnSoJGnIkCH28m3btlW7du2UkpKixYsXq0+fPvZ7Y8aM0e233669e/dqwoQJuuWWWzR37lyZTExdAQDAJdn7pfyjfmm6ZtB+JQRv0YGSdn5p39esJTZ/h4AARzIPABBwDFuuZE33dxjwsJiYGIdkXnnGjBljH50nnUrW7d27V5MnT7Yn8/6qSZMmql27tnbs2OGQzKtdu7Zq166t5s2bq1WrVmrQoIF+++03de3aVQkJCTpy5IhDPafPz7YWHwAA1ZIfRuVJp14KLsm5q9ok8gBnMM0WABB4rPskGf6OolowmcLcfvb0BhjOHK7Iz8+X2ezYRbFYLLLZyn9LvX//fmVlZSkxMbHcMqefP71WX9euXfXTTz+ppOR/U1kWLFigFi1aKC4uzqWYAQCo8o7v9kuzS3Lu1qaCNL+07S9BwaRqcHb8CQEABB6jwN8RVBsZB9yfxmE1zE4frhg4cKAmTZqkb775Rnv27NEXX3yhqVOn6sorr5Qk5eXlacyYMfrtt9+0Z88eLVy4UFdccYWaNm2qfv36SZKWL1+uV155RWvXrtXevXv1448/6oYbblBKSoq6du0qSbrxxhsVEhKi22+/XZs2bdJHH32kl19+WaNGjXL7OwEAoMoq8X3/bHnuDdpU0M/n7fpbSUihv0NAgCOZBwAIQNZzF4FHPHf7Iq1b7N76hDaZZJPZicO1kXnTpk3TNddcoxEjRqhVq1YaPXq07r77bk2cOFHSqVF669ev16BBg9S8eXPdfvvt6tSpk37++Wf7OnwRERH6/PPP1adPH7Vo0UK333672rVrpyVLltjLxMbGav78+dq9e7c6deqkhx56SE888YTuuusut74PAACqNFupT5s7WNxKq05e7dM2A0GRpUD3r7pHR04eOXdhVFsmwzB8Po8pJydHsbGxys7OdmrtHABA9WIUr5Fx7Hp/h1HllZZIg5u3lckUqqfmPqoOF7d1uF/e7/Xp69+sb6LIaMs52zmZa9WAdrv43Q9g9M0AAOf003PSj0/5pKlSI0QfHn1R2dYkn7QXSA7GbNdXqa+oflR9zeo/S3Ui6jjc5zcbEiPzAACByBTh7wiqhb3bwlRSZFZxYYnGD35WW3/f4dLz3lozDwAABKCQKJ819VvuTdUykSdJmZH7JUn78/br7gV3K7so288RIRCRzAMABJ6gRmLDde/bsirS/s8FeYWaeN1UFeQ5vx6Ot9bMAwAAASi+hU+aySppoPX5A3zSViA6Er3H/s87TuzQhF8n+C8YBCx61wCAgGMyhUpBTf0dRpU3/2PHHVuP7M3UG2Pe81M0AAAgoCW290kzG/P7y9C5l/GoigqC8rQ3bqPDtQV7F2jennl+igiBimQeACAwBbfxdwRV2vb14dqyOvKM69+88YNWL9zgVB2nNsBw7gAAAJVcRE2pRkOvNlFsC9PWwp5ebSOQbY1fLqv5zI1Gnl7+tI4VHvNDRAhUJPMAAAHJFJzq7xCqtK9n1SrzumEYmnrHdBUXlZyzDpvMsjpx2OhuAABQNXh5dN7Wwl4qMarn2smGbNqU8EuZ944VHtOUFVN8HBECGb1rAEBgCu0tfqa8I/uYRYvnxJV7/8jeTP30ya8+jAgAAFQKLS7zavVbCnp7tf5All7jD+WGZZV7f96eeTp88rAPI0Ig47+SAAAByWRJUsZhRud5w/Rx9VRUcPYuwNfTz702CxtgAABQzaReqWJzrFeqthpBOlrS2Ct1B7oSc5F+afzZWctYDas+3vqxjyJCoKN3DQAISEf2Zuq1f557qidc88t3MVr0Rfmj8k7b/Os27Vy/56xlbP+dQuvMAQAAKr8d63O0KbeXV+rOKk2WTcFeqTvQLW84VzlnGZV32ufbP1eJjf4xSOYBAALU1Dun65dvQ3Rwd4i/Q6kyTuaYNW1sfafLz5+xyIvRAACAyqQgr1g/fbhVG/PTZBie39wqoyTF43VWBgejd2hjwk9Olc0qzNKSfUu8HBEqA5J5AICA880bC7T6hw2STHr3+QR/h1NlfPpavI5nOv/Ge+MvW85632qYnD4AAEDl9tOH21SQW6Ica4L+KOjj8fozS5p4vM5Alx+cq0VNZ0sudJXWZKzxXkCoNEjmAQACSmlJqd598n/rgSz6Ik7Lvo/xY0RVx9Y1ru0Od3D72RdZdmYn29MHAACovI7uz9WOlRn2819yhynXWsujbRTYvLMWX6AqsuTrm1bTlRt2zKXnth7f6qWIUJnQuwYABJSlny/XscMnHK793yP1lXPM4p+AqpCiQtd+9m024+z3DbPTBwAAqLw2LDngcF5sRGpx9giPtmE1qs/SKgVBeZrberqyIg+cu/Bf7Di+wwsRobKhdw0ACChflbGL6vHMYP17XD0/RFO15OeREAUAAK4pLijVthVHzrieXtxRm/Iv8UNElVt2WKa+Sv0/ZUalu/V8obXQwxGhMgrydwAAAJy2Z9M+bfjpjzLvLfoiTvVTivT3UWd2JnFuJcUm7dse6tE6nZ1Ca9XZR/gBAIDAteW3QyotspZ576ecuxRjyVCD0PUVbsdiKq5wHYHMkE0bEn7SioZzVWphR1pUDCPzAAABY9mXv5/1/nvPJ+jT1+J9FE3Vsm9niEqKPfuzb5Nzm2DYPNoqAADwpd3rjpZ7z6ZgfXviUR0sbl3hdsLNORWuI1Blh2Xqy9RpWtb4C8dEHu874SaSeQCAgLF
t1c5zlnnzX0l697m6PoimaklqVKyu/bL9HQYAAKhkMtNzz3q/1AjTV8ee0N6iDhVqJz54V4WeD0RHovbox5T/6OPzntHhmDM/X+d9/RWXn+CHyFDZkcwDAASM7auc68TNfjFB44c1UtZhVotwVli4oSdn7NEjr+xVdI1Sj9Rpk9npAwAAVD7Zmfkqyj93v8GqUH1z/HH9lnuTrIZ7/bNCa7RbzwWa0MJjMhcs0xepz+mLti9qW53fZTWX/R2mZHXQ1etHq8OBS2QyTD6OFJUZ/xUEAAgI2UdzlJFe/jSOv/ptfqw2Lo/U8IkHdck1x70YWdVy8VUn1LrzST16fYoO7qnYGnpWwyyrEzvVOlMGAAAEnoy9Zx+V92eGLFp18hrtLuqsPrHTVCf43DMuTsu3xmrtyYHuhOg/hk3xmWtlsRUrIj9D0Xn7FJ2brpCSPElS5DGTXrjKLJu57CRdrZNJqlF4arbJBekDlZjdVPNbvM16enAKvWsAQEDYs2mfy8/kZQfpufsb6olbGmvL6ggvRFU1JTQo0fNf7FByc3ZDAwAA5Tt26KTrz5Qm69OsKVqaM0y5VufWOl6Sc4+KFONyW/4UnbdPbTe/rdZb3lOj9HmqdWyzPZEnSZ23G7p3rk0yyl4Yr/WRCx3OG2a30oA/RijY6tkNy1A1kcwDAASEwjz3E0vLf4jRA5c30+irm+jEUYsHo6q6atUt1eQPdyqhYZHbddhkcvoAAACVT0k5u9ieiyGL1uVfofcyp+v97Ee01NpWtnKmke4s/Jt2Ff2tImH6RcLh5ecs02OTob6rz0zmhZSGqXlm5zOuJ+Y2UdqWO2SxlT+JMj6czeDANFsAgI/lnTipbat2afuqXdq5brdyj+WppKhU2UcrtoNZWIRVtz92WDVqu9fprI5qJZTqqdm7NeLS5iouPPP9XnhMmHSWfy1MswUAoGrIPVaozPRcZabn6tjBkyopKpW11FB2ZkGF6s00mzQt+DwVlJynespUJ/M2tTXvVjvzLrU27VW0qUCrT17poU/hOxZrkRKdSOZJ0t8X2bQmxaTMGv9LZjY72lnBtrJH4NXLaa6Ldl2vRU1nl3m/Zc2W+kk/uR40qhSSeQAArzt68Ji+e3OhFr7/sw5sP+SFFgw9/vpeteqU74W6q7YGKUW6dewhvf5kvTPuNWnXSFrq+5gAAID3Ze7L1cbF+7V7/VEV5Hp+nbZ8k6FPoopU8N93egcUrwO2eH1lOz291FCT0iJdXRLn8ba9re6RFQqyOjerJKxEGv6tTf+6wSyZTAotjVDH/Zee9ZkWmV20q+Za7a256cx7NVu4FTOqFpJ5AACvWbd4k7589Tst+3KlrKXeGzF3+S1Z6tLH+QWa4eiK249q6bex2rQiyuF6t4Gd9epZknlWmWV1YsUOZ8oAAADvs1pt2vH7EW1YckBHdldsVsS5LAgvVt5ZuwAmNSuuhDvYGjbVP+DayLg2ew31WWdoYXuTLtx9lSJLYs/5zEW7rtdHMZNVHOQ4OrJ3/d4utY2qiWQeAMDjjmdka9q9b+rnz5ybflARdesX6/Z/emO0X/VhsUgPTd2n4Ze2UNF/X5+HR4Wp940XSo+U/5zNMJW7/s1fywEAAP/KTM/VwlmblXXA9U0tXLUluFTbQmxnLRNiSC2LK99ax/UP/KSokwddfm7gcpt2Nmir5kfPXCuvLJElseq++2r92Ow/9mvn1z1fKXEpLreNqodX5QAAj1r80S+6s82DPknkSYZGTd2niKizdxZxbvWaFKvv9cfs5xff2EORMWffIdj235F55zpsdDcAAPAba6lNy7/apU+fWemTRF6+DP0Qfu5pu4mlZoVUsk2ywgsylbJrjlvPJh2TrlzZ0aVnmh/trBr5de3n17e83q22UfXQuwYAeERJcYmeufn/NOmGl5R91DdTXrul5ah99zyftFUdXH5Llv2fB43o58dIAACAJ+QdL9Qnz6zUym/3yGY7c1dVbwiSFG89d6qhrhNlAophqOWW/8hic399wabpq11+JvVId0mndrHt07CP222jaqlkf3sAAIGoML9I4wZN0cLZP/u03UG3HvVpe1Vdo5aFavu3PPW5qYeatEs+Z3mbYXb6AAAAvnXiSL4+f261svb79sVniEy6+mSImpac/fc/obIl82QoJndPhWqodXSDQopOuPRM88zOCrKG6B8d/qFgc3CF2kfVUdn+9gAAAkxJcYnGD56iVfPX+bTd+imFOu9CRuV52tX35GjEy7c6VdYqk9MHAADwnZysAn350hrlHnNux1VPC5JJg06GqPFZEnp1rZWsf2AyKy+yfoWqMMumGtk7XXom1Bqu/qVDdGWzKyvUNqoWknkAgAp5dugrWv3DBp+3e/ktWTLzK+ZxXfrkKDru7GvlAQCAwFWUX6KvXl6rvONFfo3DIpOuOBmihNKyk3axtkqWzJNUGF6rwnVE5+5z+ZmO+T0r3C6qFv4zCADgtgXvLtHij5b5pe1eg0/4pd2qzmIpkkqde2PMNFsAAALP0o+3KzujwN9hSJKCZdJl+SGy/GW5PoshmSrhyH2bB6a5Ruemu/xMzgH/JmYReOhdAwDcknXouKY/OMMvbdepV6y4+FK/tF0tlG50qphVzk61BQAAvrBnw1Ft+e2wv8NwUMtmVvfCIIdrlTURYTNVPPLoPNeTeQW5JX6bMo3AVFn/DgEA/Oylu19X7vGTfmm7Wbt8v7RbXRglm/wdAgAAcFFRfokW/2eLv8Mo0/lFQUoq/V/6obK+kjXbKh55cGmBgkpc78tmpudWuG1UHSTzAAAuW/7NKv02d5Xf2m/WLjCmjlRZpdudKsY0WwAAAsfv3+7Ryexif4dRJrNMuqTgf1NUDZNUIuMsTwSmIKtnpruabSUuP3PsIBu/4X/oXQMAXDbnle/82n6TVJJ5XmU4N+LSapidPgAAgPeUFFu1Zdkhf4dxVnWtZtX70+i8o39dSK8SiDx50CP1mAyby8+UFLFwCf6H3jUAwCUHdx7Wqvnr/RpDdA06M15lBOZbfQAAULbtK46oKD/wJ6+2L7JIkmJsJkW5ns/yq6DSfIUXZHqkLps56NyF/sJaUvmSn/Ae1/8EAQCqta+nz5dh+LczERxCZ8a7nNupzZBJNid2ojMq4W51AABUJhuW7Pd3CE5pXmLRwUKbehQGK7SS9Q+icvd5LGKrJdTlZ8yWyvV9wbtI5gEAXPLDf37ydwjycy6x6jM518F0dgot02wBAPCerIN5OrqvcqynFiSTLikM8XcYbonJ3eeRevLDastmcf07sITQn8L/kMwDADjt8J4MncjI9ncYKi6kM+NVlgZOFbMZJtmMc78ldqYMAABwz5HdOf4OoVqIP7rWI/XkRjd067nY2uEeaR9VA/81BABw2raVO/0dgiTp2BHeRXmTKbiNv0MAAABOytyb6+8Qqryo3HTF5uz2SF3uJvPiG0Z7pH1UDSTzAABO275ql79DkCRt3xDh7xCqtuBUp4pZZXb6AAAA3pGRTjLP2+od/NljdeVGOzcD4s+CQsyKS4z0WAyo/BjaAA
Bw2o61nnkjWVHb1zPNwHvMUlArp0oyzRYAAP8ybIayDlSO9fIqq6DSfCUcWemRuoqDo3QitqnLz9WuHyWzmf4U/odX5QAAp2UfDYw3vyTzvCj4PJnMjHwEAKAyKCmyylpi83cYVVqjPd/LYiv2SF0HE7vJMLs+pqpeiziPtI+qg2QeAMBpJYUl/g5BkpSXHaS9W53bcRWuMUVc73RZm8xOHwAAwPNKSeR5VUz2LjXY/6NH6jJk0sGk7i4/ZzJJrbsneSQGVB30rgEAldK8D2v6O4Sqx1RDChvgdHGrYXL6AAAAqEzM1mK12vKeTDI8Ul9WrVQVhtVy+bnkNrUUU4tZKXBEMg8A4LSQ8BB/h2A3/6OaKiwgSeRR4VfLZGLEIwAAlUVQCP9J7y1Ndn+lyIIMj9RlM1m0q/FAt55t07O+R2JA1cLffACA02LjY/wdgl3uiSD99FUNf4dRdZjCZYr8u0uPnN4Aw5kDAAB4XnCIhYSeF9Q7sEQN9y/yWH17kvspL8r1pFyt+lFqmMpsFJyJv/UAAKc169DY3yE4mPN2bRksFeMRpqiHZLLUc+kZwzDL5sRhGHQ3AADwBpPZpNr1o/wdRpWSeGiZmm//2GP15UbV196GaS4/Z7aY1GdoK5lMvBTFmehdAwCc1vz8FH+H4GDnxgjlHLf4O4zKL7izFHGzv6MAAABuiE8OnJkTlV2DfQvVcutseSp9ZjUF6Y+WN8swu95f7ZiWrPgG0R6KBFWN63siAwCqrWadmvg7BAeh4TZFxVr9HUblZgqXKXayW299rTLJ6kR315kyAADAPXUakvCpqODiXLXY/pHqZK7xWJ02k1mbUm9ze3rt+Zc18lgsqHoYmQcAcFqdBrVVKynO32HYpaQWyMJrqQqwyBT7rExBDd162mY4u26eh8MGAAB2dRszMq+iWm55z+OJvD9a3qKjtc9z+dnwmBCl3dlGFgvpGpSPPx0AAJf0HdrL3yHYNW2X7+8QKjGTTDFPyRTWz+0anFkv7/QBAAC8Iy4hkoReBRWGx3usLqs5WBtT79SRup1dfjY0IkiD7j9PNepGeCweVE30rgEALrn87ktlDpA3hXXqlfg7hEoqSKbY52WKuNrfgQAAAA9o29O1TazgqDDMMzNPcqKTtbLTwzpau53Lz0bEhujKhzqqdn2mTePcAuO/xgAAlUadhvG6YEBHf4chSQoJZStbl5kTZYp7S6bwgRWuyiaT0wcAAPCepp3qKiwq2N9hVFo2c8W+O6spSDsbD9LKjqN1MjLJ5ecTm8bq6jGdVKseOxPDOaw0BABw2eD7+uvXr1b6OwyZeSXlmvBrZYp+VCazZzqKVsMkq+HEBhhOlAEAAO6zBJvVunuSVn+/19+hVFLudSptpiBlxp+n3cn9lR+Z6PLzQSFm/e2KFLXrXV8mM/0lOI9kHgDAZR0vaaee13XVko9/9WscJcV0epxibiBT7JMyhfbwdyQAAMBLOqUla/vvR5SbVejvUCodk+Ha0i2FoXE6kNRdBxO7qSTEvfUK67eMU88bW6hGHdbHg+tI5gEA3PKPV+7QusWbdSIj228x5OdZ/NZ2pWGKkin+B5lMnk98Oru5BRtgAADgfSFhQbr4llb68qU1EjvJuyQ7urEO1+2s6Nx9Ci/IlNmw2u9ZzcE6GZmo3KiGyo1uqJzoBsqLqi+Z3O/f1G9VU1c80N4DkaO6IpkHAHBLbO0Y3f/qHfrXtS/4LYbdm8P81nalEdzeK4k86b9r5jkxhZY18wAA8I36LeLUpkc9bfzpgL9DqVRyYxtrc2zj/10wbDLbrLKZgyQv9KMatPTMhhuovnhVDgBwW4+r/6a02y72W/vb1zMt4ZyCU/0dAQAA8KGuV6WoVr1If4dRuZnMslmCvZLIk6T4huxYi4ohmQcAqJCRr9+lC6/s4pe2j+wPUfYxptqejSm4jdfqNpzcydZgZB4AAD4TEhakgfe3V2x8uL9DQTlI5qGiSOYBACrEYrHo8Q9Gqsc1f/NL+9vXMTqvXKYIKeRCr1VvM0xOHwAAwHciY0M1eFQHxSXQTwo0Sc1qKCwy2N9hoJIjmQcAqLDgkGA9/sFIDRrRz+dt/zQ31udtVhphA2UyR/k7CgAA4AdRcWG68qGOSmxKXymQtOlZz98hoAogmQcA8AiLxaJ/vHKHnv7uccU3qOWzdhfPiVPuCabalsUUcaNX6z+9m60zBwAA8L3w6BBdOaqjLrymqYKC+T32t4iYEDXpEO/vMFAF8LcZAOBRnfu115sbpqr/7X180l5RgVk/fMKOYGcI7ihTcCuvNsE0WwAAAp/JbFL7Sxrq+n92UUITRun5U+vuSbJYSMOg4vhTBADwuMiYCI168x5NX/2sulzW0evtfT2rtmw2rzdTiZhkinrQ6604s/nF6cMVVqtV48aNU+PGjRUeHq6UlBRNnDhRhmHYywwbNkwmk8nhSEtLs9/fs2ePbr/9doc6xo8fr+LiYocyf63DZDLpt99+q/iXAwBAgKlRN0JXje6oASPaKYbNMXwuPDpY7S6u7+8wUEUE+TsAAEDV1bR9Y02a+6ge6j1e65ds9lo7B3aF6rvZNTXg5mNea6NSibhRptAL/B2F26ZMmaLp06dr1qxZSk1N1cqVK3XrrbcqNjZW999/v71cWlqaZsyYYT8PDQ21//OWLVtks9n0+uuvq2nTptq4caPuvPNOnTx5Us8//7xDez/88INSU1Pt57Vq+W6aOAAAvmQym9SoXW3VbRKj2eN/U9HJUn+HVG30vLGFwqNC/B0GqgiSeQAAr3t45n26q91Dys8t8Fobb/4rSZ165SqhQYnX2qgULPVlihrjk6acnULr6jTbZcuW6YorrtCAAQMkSY0aNdIHH3ygFStWOJQLDQ1VQkJCmXWkpaU5jNRr0qSJtm7dqunTp5+RzKtVq1a59QAAUBWFR4Wo140tNe/Njf4OpVpoen4dpXSo4+8wUIUwzRYA4HV1k+N1x5S/e7WNgpMWvTS6gVfbCHxmmWImy2SO8Elrrq6Zl5OT43AUFRWVWW+3bt20cOFCbdu2TZK0bt06LV26VP3793cot3jxYtWpU0ctWrTQ8OHDlZWVddZ4s7OzVbNmzTOuDxo0SHXq1FH37t311VdfufNVAABQ6TTtVEcpHUkweVtETIguGtLc32GgiiGZBwDwicvvvlQdL2nr1TbW/Bytr2ZU3ymSppiJAT29tkGDBoqNjbUfkydPLrPc2LFjNWTIELVs2VLBwcHq0KGDRo4cqZtuusleJi0tTe+++64WLlyoKVOmaMmSJerfv7+sVmuZde7YsUPTpk3T3Xffbb8WFRWlF154QZ988om++eYbde/eXYMHDyahBwCoNnre2Fzh0cH+DqPKCgkP0uX/OI/ptfA4ptkCAHzCZDLpnx+N0p1tRynr4HGvtTP9iXqKTypR1345XmsjEJmiH5cp4lqftunqNNt9+/YpJibGfv3Pa9z92ccff6zZs2fr/fffV2pqqtauXauRI0cqKSlJQ4cOlSQNGTLEXr5t27Zq166dUlJStHjxYvXp47iT8
oEDB5SWlqZrr71Wd955p/167dq1NWrUKPt5586ddfDgQT333HMaNGiQE98AAACVW3hUiC6/7zx9OmWlDDYT86iQMIsuv+88xTeI9ncoqIIYmQcA8JnouCil3XaxV9uwWU2adHeyViysLh0ns0wxk2SKHOrzll2dZhsTE+NwlJfMGzNmjH10Xtu2bXXzzTfrwQcfLHckn3RqTbzatWtrx44dDtcPHjyo3r17q1u3bnrjjTfO+ZkuuOCCM+oAAKAqq5McozoNq0u/yTfCooI1eFRHJabE+jsUVFEk8wAAPhUaXnYCx5NKis168tbGWjSnhtfb+qujh8OUfcz7n1GSZGksU83ZPh+R5235+fkymx27KBaLRTZb+UMG9u/fr6ysLCUmJtqvHThwQL169VKnTp00Y8aMM+osy9q1ax3qAACgOggOq9qT9ixBrm3GVRENU2vp+sc7K54EKbyoav+NBQAEnKAQ3/z0WEtNemZEstb8HKW7xx9UZIy3546YpIhbFH/eKMkokZEzSSr8wkttmaWIoTJFPyiTKcxLbZybIcmmc3eODRfrHThwoCZNmqSGDRsqNTVVa9as0dSpU3XbbbdJkvLy8jRhwgRdffXVSkhI0M6dO/Xwww+radOm6tevn6T/JfKSk5P1/PPPKzMz017/6Z1rZ82apZCQEHXo0EGS9Pnnn+udd97RW2+95WLEAABUbpagqjnOJyTMoguvaabW3ZN07NBJ/fjuHzqy2ztLsYSEB6n7tU3VqluSV+oH/oxkHgDAp8KjfJt8mvdBLa1eEq2Rz+3X+b1zvdOIJVmm2KdlCul86twULlONKTKKBsjImy6VrPJQQyYppLtMUffJFNLBQ3W6z9U185w1bdo0jRs3TiNGjFBGRoaSkpJ0991364knnpB0apTe+vXrNWvWLJ04cUJJSUnq27evJk6caJ+6u2DBAu3YsUM7duxQ/fr1Heo3jP+lFydOnKi9e/cqKChILVu21EcffaRrrrnGpXgBAKjsgkMt/g7B4xq0rqnef2+p6Jqn+p41EyN11ZhO2rhkv9b9uF85mQUeaScoxKzmneuq8+VNFBXno9kZqPZMxp97tD6Sk5Oj2NhYZWdnOyyEDQCo+jb8/IdG9XzCL233vOK4rrzjqFp1yvdMheZ6MkVcL0UOlckUXm4xo2SrjPz3pcKvJOOk6+2YakjhV8sUcYNMQQ3dj9dF5f1en75+8Tf3KCjy3J3W0pNF+nHAa/zuBzD6ZgBQva2Yu1u/z93t7zA8om7jGLW7uL6ad04ot4xhGErffEwblxzQ3g1H5U5WpEbdCLW5qJ5adk1QaITvdgTmNxsSI/MAAD7WtEMjmc0m2Ww+f5ekJV/GacmXcUppk6+BQ7PU+8oTCotwdfrtf0fHRdwkhfaSyXTuaSmm4BYyxU6QET1GKlkrlWyUUbJJKt0kWfefWb+lkRScKlNwqhTURgppL5Mp8N70emtkHgAA8K3KvgFGUIhZzTrXVdue9Z1aq85kMik5tZaSU2vp5IkiHd6VrYz0XGWm5ypzb64KT5Y4lDcHmVQrKUrxydGq0zBadZJjVLtBlEwm+jjwD5J5AACfCo8KV73mSdq35YDfYti5MUIvjYnQv8fVU0pqgZq1K1Czdvlq2rZAtRJKFBJqyGQ2VFJoVm62RSFR7RTfqIcU3PZUks0c51a7JnOUFNpdCu1uX2nOMEoko0hSsaQQyRQmk6ly/DyTzAMAoGqIT658ybxm59dRQkrsqcRa/SgFhbg3VTiyRqhSOtZRSsc69mvWEptKS6wybJIlxKygILNMZvozCByV478WAABVSvPzm/g1mXdacaFZf6yK1B+rIs9a7t2dL8oUVdcrMZhMwZLJd1MzPIlkHgAAVUNkbKgiY0N0MrvY36E4JbpWmPre0cZr9VuCzbIEV81NQVA18KcTAOBzXQd29ncITmvUpoESG3snkQcAABAoGp0X7+8QnNaoXW1/hwD4Fck8AIDPXTi4s2omujdV1dcGDe/n7xAClmGYnD4AAEBga3NRPX+H4LS2PStPrIA3kMwDAPhcUHCQ+t9+sb/DOKeI6HBdcvNF/g4jYNlkcvoAAACBrXb9KCWmxPo7jHOq1yJOcQlnXyIFqOpI5gEA/OLyuy+VJci9hYp9pc/fL1J4VLi/wwAAAPCJNpVgxBuj8gCSeQAAP6ldr5Yuv/tSf4dRrojocA155Ap/hxHQTm+A4cwBAAACX0qnOqpVP8rfYZSrTnK0GrevPGv7Ad5CMg8A4De3P3OTEhrX8XcYZbrz2ZtVpyGdxbNhzTwAAKoWi8WsPkNbyWwOvN9uc5BJFwdobICvkcwDAPhNeGSYRr89QiZTYHXKOl7SNqBHDQIAAHhLfINodeyf7O8wztB5QGPVSgrcUYOAL5HMAwD41Xm9UjVoRODsGBsRE66H3hru7zAqBabZAgBQNZ1/WSPVbhA4ibM6ydHq2C/wEoyAv5DMAwD43T1Th+qCAR39HYZCwoL1rzmPML3WSUyzBQCgarJYzBowop2ia4b5OxTFxIfrshHtmF4L/AnJPACA3wUFB2ncx6PU8ZK2foshODRYT3zykM7rleq3GAAAAAJFVFyYBo1sr6i4UL/FEF0zTFc80F6Rsf6LAQhEJPMAAAEhNDxUE79+VN2u6OzztsOjwjTpm0d1wYBOPm+7MjOcnGLLyDwAACqnGnUidOXojoqtE+7ztuMSInTVmI6Kqe37toFARzIPABAwQkKDNf6z0brr2ZsVEhbskzZbd22uV39/Rh0u9t+owMrKkGQYThz+DhQAALgtpla4rh17vlr+LcFnbbbqlqirHzlfUXH+n+YLBKIgfwcAAMCfmc1mXTt6kC64vJOev+1V/fHbdq+0ExIWrGETb9DVDw6Q2cy7LXfYZJJJ5x51Z3OiDAAACFyhEcHqM6y1UjrW0eLZW3Qyu9gr7UTFharX31sqObWWV+oHqgqSeQCAgNSwZT29+PNEffXqPH3+8jc6vDvDI/VagizqNrizbp04RA1a1PNInQAAANVBo3a1dUPTC/T73D3649dDKi4o9Ui9oRFBatktUZ0HNFZoOGkK4Fz4WwIACFgWi0VX3n+ZrrgvTb9/t0ZfTZ+nld+vlc3m+sTN2vVq6rI7LlH/O/uodlJNL0Rb/Ti7Uy1r5gEAUHWERgSr+3XNdMEVTbRtxWFtWHJAWfvz3KorvmG02lxUT8261FVwiMXDkQJVF8k8AEDAM5vNumBAJ10woJMy9h3Vpl+2avuqndq+epe2r96tk9n5DuVNJpPqN09Us05N1LxTipp1aqLUbi1kCaKT6Ek2wySTE4k6G8k8AACqnOBQi1J71FNqj3o6uj9PR3ZnKyM9V5l7c5V1ME+2UseXr5Ygs2rVi1R8cozqNIxW3cYxqlUvyk/RA5UbyTwAQKVSp0Ft1RlSW72HXChJMgxDJ7PzVVxYLJvVpuDQYIVHhSkkLMTPkQIAAFQPtetHqXb9KKX+99xmtam40CpriU2SZAk2KyTMIrOFdYoBTyCZBwCo1Ewmk6JqREqK9Hco1c7p3WqdKQcA
AKoPs8WssEgSd4C3kMwDAABuYc08AAAAwPdIlQMAAAAAAACVBCPzAACAWxiZBwAAAPgeyTwAAOAWdrMFAAAAfI9ptgAAAAAAAEAlQTIPAAC45fRuts4cAAAAgDeYTCbNmTPH32H4FNNsAQCAW04l6pxZM88HwQAAAKBaOnTokOLi4vwdhk+RzAMAAG5hAwwAAICqx2oztGL3MWXkFqpOdJi6NK4pizlw+3MJCQn+DsHnmGYLAAAAAAAAfb/xkLpP+VE3vPmbHvhwrW548zd1n/Kjvt94yLvtfv+9unfvrho1aqhWrVq6/PLLtXPnTklScXGx7rvvPiUmJiosLEzJycmaPHmy/dm/TrN95JFH1Lx5c0VERKhJkyYaN26cSkpK7PeffPJJtW/fXu+9954aNWqk2NhYDRkyRLm5uV79jJ5EMg8AALjFcOEAAABAYPt+4yEN/89qHcoudLh+OLtQw/+z2qsJvZMnT2rUqFFauXKlFi5cKLPZrCuvvFI2m03/93//p6+++koff/yxtm7dqtmzZ6tRo0bl1hUdHa2ZM2dq8+bNevnll/Xmm2/qxRdfdCizc+dOzZkzR3PnztXcuXO1ZMkSPfPMM177fJ7GNFsAAOAWptkCAABUDVaboQlfby7zJawhySRpwtebdWnrBK9Mub366qsdzt955x3Fx8dr8+bNSk9PV7NmzdS9e3eZTCYlJyefta5//vOf9n9u1KiRRo8erQ8//FAPP/yw/brNZtPMmTMVHR0tSbr55pu1cOFCTZo0yYOfynsYmQcAAAAAAFCNrdh97IwReX9mSDqUXagVu495pf3t27frhhtuUJMmTRQTE2MfeZeenq5hw4Zp7dq1atGihe6//37Nnz//rHV99NFHuvDCC5WQkKCoqCj985//VHp6ukOZRo0a2RN5kpSYmKiMjAyPfy5vIZkHAADcwzxbAACAKiEjt/xEnjvlXDVw4EAdO3ZMb775ppYvX67ly5dLOrVeXseOHbV7925NnDhRBQUFuu6663TNNdeUWc+vv/6qm266SZdddpnmzp2rNWvW6PHHH1dxcbFDueDgYIdzk8kkm83mlc/mDUyzBQAA7nFymq2YZgsAABDQ6kSHebScK7KysrR161a9+eab6tGjhyRp6dKlDmViYmJ0/fXX6/rrr9c111yjtLQ0HTt2TDVr1nQot2zZMiUnJ+vxxx+3X9u7d6/HY/Y3knkAAAAAAADVWJfGNZUYG6bD2YVlTqowSUqIDVOXxjXLuFsxcXFxqlWrlt544w0lJiYqPT1dY8eOtd+fOnWqEhMT1aFDB5nNZn3yySdKSEhQjRo1zqirWbNmSk9P14cffqjOnTvrm2++0RdffOHxmP2NabYAAMAthuH8AQAAgMBlMZs0fmBrSacSd392+nz8wNZe2fzCbDbrww8/1KpVq9SmTRs9+OCDeu655+z3o6Oj9eyzz+r8889X586dtWfPHn377bcym89MaQ0aNEgPPvig7rvvPrVv317Lli3TuHHjPB6zv5kMw/dd7JycHMXGxio7O1sxMTG+bh4AADihvN/r09cbvfNPmSPOPdXCll+oPbc9xe9+AKNvBgBA5eDt3+zvNx7ShK83O2yGkRgbpvEDWyutTaLH24N7mGYLAAAAAAAApbVJ1KWtE7Ri9zFl5BaqTvSpqbXeGJEH95HMAwAA7jFMzm1uwQYYAAAAlYbFbFLXlFr+DgNnQTIPAAC4xdn18FgzDwAAAPAcknkAAMA9xn8PZ8oBAAAA8Ah2swUAAAAAAAAqCUbmAQAAtxiGSYYT6+E5UwYAAACAc0jmAQAA9zGFFgAAAPApptkCAAAAAAAAlQQj8wAAgFuYZgsAAAD4HiPzAACAewwXDgAAAKAcvXr10siRI8u936hRI7300ks+ay/QkcwDAAAAAAAAKgmm2QIAADeZ/ns4Uw4AAACVgs0q7V0m5R2RoupKyd0ks8XfUeFPGJkHAADcwzRbAACAqmXzV9JLbaRZl0uf3X7qf19qc+q6l5WWluq+++5TbGysateurXHjxskwyu5ITp06VW3btlVkZKQaNGigESNGKC8vz6HML7/8ol69eikiIkJxcXHq16+fjh8/XmZ933zzjWJjYzV79myPfy5vIJkHAAAAAABQ3W3+Svr4FinnoOP1nEOnrns5oTdr1iwFBQVpxYoVevnllzV16lS99dZbZZY1m836v//7P23atEmzZs3Sjz/+qIcffth+f+3aterTp49at26tX3/9VUuXLtXAgQNltVrPqOv999/XDTfcoNmzZ+umm27y2ufzJKbZAgAA9zg76o6ReQAAAIHNZpW+f0Rld9wMSSbp+7FSywFem3LboEEDvfjiizKZTGrRooU2bNigF198UXfeeecZZf+8eUWjRo301FNP6Z577tG///1vSdKzzz6r888/334uSampqWfU8+qrr+rxxx/X119/rZ49e3r+Q3kJyTwAAOAew3TqcKYcAAAAAtfeZWeOyHNgSDkHTpVr3MMrIfztb3+TyfS/fmPXrl31wgsvlDma7ocfftDkyZO1ZcsW5eTkqLS0VIWFhcrPz1dERITWrl2ra6+99qztffrpp8rIyNAvv/yizp07e/zzeBPTbAEAgFsMw/kDAAAAASzviGfLedGePXt0+eWXq127dvrss8+0atUqvfrqq5Kk4uJiSVJ4ePg56+nQoYPi4+P1zjvvlLs2X6ByOZlXUFCgpUuXavPmzWfcKyws1LvvvuuRwAAAAOAc+mcAAKBCoup6tpwbli9f7nD+22+/qVmzZrJYHKf1rlq1SjabTS+88IL+9re/qXnz5jp40HFUYbt27bRw4cKztpeSkqJFixbpyy+/1D/+8Q/PfAgfcSmZt23bNrVq1UoXXXSR2rZtq549e+rQoUP2+9nZ2br11ls9HiQAAAhA7GYbEOifAQCACkvuJsUkSSpveRSTFFPvVDkvSU9P16hRo7R161Z98MEHmjZtmh544IEzyjVt2lQlJSWaNm2adu3apffee0+vvfaaQ5lHH31Uv//+u0aMGKH169dry5Ytmj59uo4ePepQrnnz5lq0aJE+++wzh3X4Ap1LybxHHnlEbdq0UUZGhrZu3aro6GhdeOGFSk9P91Z8AAAgUJ1eM8+ZA15D/wwAAFSY2SKlTfnvyV/7bv89T3vGa5tfSNItt9yigoICdenSRffee68eeOAB3XXXXWeUO++88zR16lRNmTJFbdq00ezZszV58mSHMs2bN9f8+fO1bt06denSRV27dtWXX36poKAzt45o0aKFfvzxR33wwQd66KGHvPb5PMlkuDAxuG7duvrhhx/Utm1bSZJhGBoxYoS+/fZbLVq0SJGRkUpKSipzccI/y8nJUWxsrLKzsxUTE1OxTwAAALyivN/r09fr/9+/ZA4PO2c9toJC7b//CX73vcQT/TP6ZgAAVA5e/83e/NWpXW3/vBlGTL1TibzWgzzfHtzi0m62BQUFDllMk8mk6dOn67777lPPnj31/vvvezxAAAAQmEzGqcOZcvAe+mcAAMBjWg+SWg44tWtt3pF
Ta+Qld/PqiDy4zqVkXsuWLbVy5Uq1atXK4forr7wiSRo0iCwtAADVhrPr4ZHM8yr6ZwAAwKPMFqlxD39HgbNwac28K6+8Uh988EGZ91555RXdcMMNlW47XwAAgMqM/hkAAED14tKaeZ7CuiwAAAS+c62Z1+DFiU6vmbfvwXH87gcw+mYAAFQO/GZDcnFkniTt2bNHb775pl599VVt3LjRGzEBAIDKwHDhcIHVatW4cePUuHFjhYeHKyUlRRMnTnQYXTZs2DCZTCaHIy0tzX5/z549uv322x3qGD9+vIqLix3aWr9+vXr06KGwsDA1aNBAzz77rBtfhP/RPwMAAKg+XFozb9GiRbr88stVUFBw6uGgIL3zzjv6+9//7pXgAABA9TNlyhRNnz5ds2bNUmpqqlauXKlbb71VsbGxuv/+++3l0tLSNGPGDPt5aGio/Z+3bNkim82m119/XU2bNtXGjRt155136uTJk3r++eclnXqz3bdvX11yySV67bXXtGHDBt12222qUaOG7rrrLt994AqifwYAAFC9uDQyb9y4cbr00kt14MABZWVl6c4779TDDz/srdgAAEAg89LIvGXLlumKK67QgAED1KhRI11zzTXq27evVqxY4VAuNDRUCQkJ9iMuLs5+73Sir2/fvmrSpIkGDRqk0aNH6/PPP7eXmT17toqLi/XOO+8oNTVVQ4YM0f3336+pU6e68WX4D/0zAACA6sWlZN7GjRv19NNPKzExUXFxcXruueeUkZGhrKwsb8UHAAAClYvJvJycHIejqKiozGq7deumhQsXatu2bZKkdevWaenSperfv79DucWLF6tOnTpq0aKFhg8ffs7+SHZ2tmrWrGk///XXX3XRRRcpJCTEfq1fv37aunWrjh8/7vz34Gf0zwAAAKoXl5J5OTk5ql27tv08IiJC4eHhys7O9nhgAAAgwBkm5w9JDRo0UGxsrP2YPHlymdWOHTtWQ4YMUcuWLRUcHKwOHTpo5MiRuummm+xl0tLS9O6772rhwoWaMmWKlixZov79+8tqtZZZ544dOzRt2jTdfffd9muHDx9W3bp1HcqdPj98+HCFvhpfon8GAABQvbi0Zp4kzZs3T7GxsfZzm82mhQsXOiy2PGjQIM9EBwAAqox9+/Y57Lr25zXu/uzjjz/W7Nmz9f777ys1NVVr167VyJEjlZSUpKFDh0qShgwZYi/ftm1btWvXTikpKVq8eLH69OnjUN+BAweUlpama6+9VnfeeacXPpn/0T8DAACVWa9evdS+fXu99NJL/g5FkjRz5kyNHDlSJ06ckCQ9+eSTmjNnjtauXevXuE5zOZl3uhP9Z39+y20ymcp9Kw4AAKoOk3HqcKacJMXExDgk88ozZswY++g86VSybu/evZo8eXKZ/RBJatKkiWrXrq0dO3Y4JPMOHjyo3r17q1u3bnrjjTccnklISNCRI0ccrp0+T0hIOPcHCyD0zwAAADzn+uuv12WXXWY/Hz16tP7xj3/4MSJHLiXzbDabt+IAAACVjbObW7i4AUZ+fr7MZseVQCwWy1n7Ifv371dWVpYSExPt1w4cOKDevXurU6dOmjFjxhl1du3aVY8//rhKSkoUHBwsSVqwYIFatGjhsJlGoKN/BgAAPMlqs2p1xmpl5mcqPiJeHet0lMVs8XdYPhUeHq7w8HD7eVRUlKKiovwYkSOX1sw7F5vNprlz53qySgAAUM0MHDhQkyZN0jfffKM9e/boiy++0NSpU3XllVdKkvLy8jRmzBj99ttv2rNnjxYuXKgrrrhCTZs2Vb9+/SSdSuT16tVLDRs21PPPP6/MzEwdPnzYYS28G2+8USEhIbr99tu1adMmffTRR3r55Zc1atQov3xub6F/BgAAnPXD3h/U77N+um3ebXrk50d027zb1O+zfvph7w9eb9tms+nhhx9WzZo1lZCQoCeffNJ+b+rUqWrbtq0iIyPVoEEDjRgxQnl5eZIkwzAUHx+vTz/91F6+ffv2Di95ly5dqtDQUOXn55+zPunUNNsaNWrYz5988km1b9/efv7777/r0ksvVe3atRUbG6uePXtq9erVDp/HZDLprbfe0pVXXqmIiAg1a9ZMX331lSe+Ks8k83bs2KHHHntM9evXt3e0AQAA3DFt2jRdc801GjFihFq1aqXRo0fr7rvv1sSJEyWdGqW3fv16DRo0SM2bN9ftt9+uTp066eeff7avw7dgwQLt2LFDCxcuVP369ZWYmGg/TouNjdX8+fO1e/duderUSQ899JCeeOIJ3XXXXX753J5G/wwAALjih70/aNTiUTqS77gMSUZ+hkYtHuX1hN6sWbMUGRmp5cuX69lnn9W//vUvLViwQJJkNpv1f//3f9q0aZNmzZqlH3/8UQ8//LCkU0mziy66SIsXL5YkHT9+XH/88YcKCgq0ZcsWSdKSJUvUuXNnRUREnLM+Z+Tm5mro0KFaunSpfvvtNzVr1kyXXXaZcnNzHcpNmDBB1113ndavX6/LLrtMN910k44dO1bRr0omwzBcnPxySkFBgT755BO99dZb+uWXX9SjRw8NGTJEV1555Rk7w/1VTk6OYmNjlZ2d7dTaOQAAwPfK+70+fT15ylMyh4Wdsx5bYaH2PvJPfvd9wN3+GX0zAAAqB2/9ZlttVvX7rN8ZibzTTDKpbkRdfX/1916ZcturVy9ZrVb9/PPP9mtdunTRxRdfrGeeeeaM8p9++qnuueceHT16VNKpl8Gvv/66Nm7cqC+//FKTJ09WQkKC0tLSdM899+jSSy9Vly5dNGnSpDLb/2t9rm6AYbPZVKNGDb3//vu6/PLLJZ1KMv7zn/+0v5A+efKkoqKi9N133yktLc2t7+k0l0fm/f7777r77ruVkJCgl156SVdccYVMJpP+/e9/65577jlnIg8AAACeRf8MAABUxOqM1eUm8iTJkKHD+Ye1OmN1uWUqql27dg7niYmJysjIkCT98MMP6tOnj+rVq6fo6GjdfPPNysrKsk+b7dmzpzZv3qzMzEwtWbJEvXr1Uq9evbR48WKVlJRo2bJl6tWrl73uc9V3LkeOHNGdd96pZs2aKTY2VjExMcrLy1N6enq5nykyMlIxMTH2z1QRLiXz2rVrp2uvvVa1atXSsmXLtHr1aj300EMymUwVDgQAAFQyhsn5A15D/wwAAFRUZn6mR8u54/SGZKeZTCbZbDbt2bNHl19+udq1a6fPPvtMq1at0quvvipJKi4uliS1bdtWNWvW1JIlSxySeUuWLNHvv/+ukpISdevWTZKcqu9chg4dqrVr1+rll1/WsmXLtHbtWtWqVeuM58v7TBXl0m62W7du1fXXX6/evXurdevWFW4cAABUYl7azRauoX8GAAAqKj4i3qPlPGnVqlWy2Wx64YUXZDafGpP28ccfO5QxmUzq0aOHvvzyS23atEndu3dXRESEioqK9Prrr+v8889XZGSk0/Wdyy+//KJ///vfuuyyyyRJ+/bts0/R9QWXRubt2rVLLVq00PDhw1W/fn2NHj1aa9as4c0vAACAn9A/AwAAFdWxTk
fVjagrk8ruP5hkUkJEgjrW6ejjyKSmTZuqpKRE06ZN065du/Tee+/ptddeO6Ncr1699MEHH6h9+/aKioqS2WzWRRddpNmzZ6tnz54u13c2zZo103vvvac//vhDy5cv10033aTw8PAKf1ZnuZTMq1evnh5//HHt2LFD7733ng4fPqwLL7xQpaWlmjlzprZt2+atOAEAQKAxXDjgNfTPAABARVnMFo3tMlaSzkjonT5/pMsjXtn84lzOO+88TZ06VVOmTFGbNm00e/ZsTZ48+YxyPXv2lNVqdVgb7/TGGn++5mx9Z/P222/r+PHj6tixo26++Wbdf//9qlOnjrsf0WVu72Z7WnZ2tmbPnq133nlHq1evVps2bbR+/fqzPsOOaQAABL5z7WbbaNIkp3ez3fP44/zu+5Cr/TP6ZgAAVA7e/s3+Ye8PembFMw6bYSREJOiRLo/okuRLPN4e3OPSmnlliY2N1YgRIzRixAitXbtW77zzjifiAgAAgY418wIW/TMAAOCOS5IvUe8GvbU6Y7Uy8zMVHxGvjnU6+mVEHspX4WTen7Vv317/93//58kqAQAAUAH0zwAAgCssZos6J3T2dxg4C5eSeRdffPE5y5hMJi1cuNDtgAAAQCXByLyAQP8MAACgenEpmbd48WIlJydrwIABCg4O9lZMAACgEjAZpw5nysF76J8BAABULy4l86ZMmaIZM2bok08+0U033aTbbrtNbdq08VZsAAAAOAf6ZwAAANWL2ZXCY8aM0ebNmzVnzhzl5ubqwgsvVJcuXfTaa68pJyfHWzECAIBAZJicP+A19M8AAACqF5eSead17dpVb775pg4dOqR7771X77zzjpKSkugwAgBQnRguHPA6+mcAAADVg1vJvNNWr16tJUuW6I8//lCbNm1YpwUAAMDP6J8BAABUbS4n8w4ePKinn35azZs31zXXXKOaNWtq+fLl+u233xQeHu6NGAEAQAA6vQGGMwe8i/4ZAABA9eHSBhiXXXaZFi1apL59++q5557TgAEDFBTkUhUAAKCqcHYKLck8r6J/BgAAqqNevXqpffv2eumll/wdis+51NP7/vvvlZiYqPT0dE2YMEETJkwos9zq1as9EhwAAADOjv4ZAACoyhYvXqzevXvr+PHjqlGjhr/DCQguJfOeeOIJmUzsSAcAACQ5O4WWkXleRf8MAAB4kmG1Kn/lKpVmZiooPl4R53eSyWLxd1g+UVxcrJCQEH+HcU4uJfOefPJJL4UBAAAqHabZBgT6ZwAAwFNy5s/Xkacnq/TwYfu1oIQE1X3sUcX07eu1douKijRmzBh9+OGHysnJ0fnnn68XX3xR8fHx6t27tyQpLi5OkjR06FDNnDlTkmSz2fTwww/rrbfeUkhIiO655x6HvtGJEyc0evRoffnllyoqKrLXe95550k61Y+aM2eO7rvvPk2aNEl79+6VzWbz2uf0FJc2wIiLi1PNmjXPOBo3bqx+/fppwYIF3ooTAAAEGsOFA15D/wwAAHhCzvz5OvDASIdEniSVHjmiAw+MVM78+V5r++GHH9Znn32mWbNmafXq1WratKn69eun6OhoffbZZ5KkrVu36tChQ3r55Zftz82aNUuRkZFavny5nn32Wf3rX/9y6Ptce+21ysjI0HfffadVq1apY8eO6tOnj44dO2Yvs2PHDn322Wf6/PPPtXbtWq99Rk9yaWReeYsKnjhxQqtWrdLll1+uTz/9VAMHDvREbAAAADgH+mcAAKCiDKtVR56eLBllvIU1DMlk0pGnJyu6Tx+PT7k9efKkpk+frpkzZ6p///6SpDfffFMLFizQO++8o86dO0uS6tSpc8aaee3atdP48eMlSc2aNdMrr7yihQsX6tJLL9XSpUu1YsUKZWRkKDQ0VJL0/PPPa86cOfr000911113STo1tfbdd99VfHy8Rz+XN7mUzBs6dOhZ77dv316TJ0+mswgAQDVgcnLNPKfW1YPb6J8BAICKyl+56owReQ4MQ6WHDyt/5SpFXtDFo23v3LlTJSUluvDCC+3XgoOD1aVLF/3xxx/2ZF5Z2rVr53CemJiojIwMSdK6deuUl5enWrVqOZQpKCjQzp077efJycmVKpEnuZjMO5fLL79cTz31lCerBAAAQAXQPwMAAOdSmpnp0XK+Ehwc7HBuMpnsa97l5eUpMTFRixcvPuO5P4/wi4yM9GaIXuHRZF5RUVGl2PUDAACguqB/BgAAziXIyZFpzpZzRUpKikJCQvTLL78oOTlZklRSUqLff/9dI0eOtPdjrFarS/V27NhRhw8fVlBQkBo1auTpsP3KpQ0wzuXtt99W+/btPVklAAAIVGyAUSnQPwMAAOcScX4nBSUkSCZT2QVMJgUlJCji/E4ebzsyMlLDhw/XmDFj9P3332vz5s268847lZ+fr9tvv13JyckymUyaO3euMjMzlZeX51S9l1xyibp27arBgwdr/vz52rNnj5YtW6bHH39cK1eu9Pjn8CWXRuaNGjWqzOvZ2dlavXq1tm3bpp9++skjgQEAgMDGmnmBgf4ZAACoKJPForqPPaoDD4w8ldD780YY/03w1X3sUY9vfnHaM888I5vNpptvvlm5ubk6//zzNW/ePMXFxSkuLk4TJkzQ2LFjdeutt+qWW27RzJkzz/2ZTCZ9++23evzxx3XrrbcqMzNTCQkJuuiii1S3bl2vfA5fMRlGWVuVlK13795lXo+JiVGLFi00fPhwNW7c+Jz15OTkKDY2VtnZ2YqJiXE+WgAA4DPl/V6fvt507NOyhIWdsx5rYaF2PPMYv/te4on+GX0zAAAqB2//ZufMn68jT0922AwjKCFBdR97VDF9+3q8PbjHpZF5ixYt8lYcAACgMmLUnd/RPwMAAJ4S07evovv0ObW7bWamguLjFXF+J6+NyIN7PLoBBgAAqEacXQ+PhB8AAEClYbJYFHlBF3+HgbPw6AYYAAAAAAAAALyHkXkAAMAtbIABAAAA+B7JPAAA4B6m2QIAAAA+RzIPAAC4hZF5AAAAgO+xZh4AAAAAAABQSTAyDwAAuIdptgAAAIDPkcwDAADuIZkHAAAA+BzTbAEAAAAAABDwZs6cqRo1apy1zLBhwzR48GCfxOMvjMwDAABuYQMMAAAABJqXX35ZhvG/DmivXr3Uvn17vfTSS/4LysNI5gEAAPcwzRYAAKDKsdkMHdp+QidzihQZE6rEZjVkNpv8HZbTYmNj/R2C1zHNFgAAAAAAANq5JkPvPrZMc15cowVvb9acF9fo3ceWaeeaDK+1OXfuXNWoUUNWq1WStHbtWplMJo0dO9Ze5o477tDf//53+/m8efPUqlUrRUVFKS0tTYcOHbLf+/M022HDhmnJkiV6+eWXZTKZZDKZtGfPHknSxo0b1b9/f0VFRalu3bq6+eabdfToUa99Tk8imQcAANxjuHAAAAAgoO1ck6HvX9+okyeKHK6fPFGk71/f6LWEXo8ePZSbm6s1a9ZIkpYsWaLatWtr8eLF9jJLlixRr169JEn5+fl6/vnn9d577+mnn35Senq6Ro8eX
WbdL7/8srp27ao777xThw4d0qFDh9SgQQOdOHFCF198sTp06KCVK1fq+++/15EjR3Tdddd55TN6Gsk8AADgltNr5jlzAAAAIHDZbIZ+/mj7Wcss/Xi7bDbPd+xiY2PVvn17e/Ju8eLFevDBB7VmzRrl5eXpwIED2rFjh3r27ClJKikp0Wuvvabzzz9fHTt21H333aeFCxeWW3dISIgiIiKUkJCghIQEWSwWvfLKK+rQoYOefvpptWzZUh06dNA777yjRYsWadu2bR7/jJ5GMg8AAAAAAKAaO7T9xBkj8v4q73iRDm0/4ZX2e/bsqcWLF8swDP3888+66qqr1KpVKy1dulRLlixRUlKSmjVrJkmKiIhQSkqK/dnExERlZLg2anDdunVatGiRoqKi7EfLli0lSTt37vTcB/MSNsAAAADuYQMMAACAKuFkztkTea6Wc1WvXr30zjvvaN26dQoODlbLli3Vq1cvLV68WMePH7ePypOk4OBgh2dNJpPD7rXOyMvL08CBAzVlypQz7iUmJrr3IXyIZB4AAHCLs1NomWYLAAAQ2CJjQj1azlWn18178cUX7Ym7Xr166ZlnntHx48f10EMPuV13SEiIfXON0zp27KjPPvtMjRo1UlBQ5UuNMc0WAAC4hw0wAAAAqoTEZjUUWePsibqouFAlNqvhlfbj4uLUrl07zZ49277RxUUXXaTVq1dr27ZtDiPzXNWoUSMtX75ce/bs0dGjR2Wz2XTvvffq2LFjuuGGG/T7779r586dmjdvnm699dYzEn+BiGQeAAAAAABANWY2m9Tj+mZnLdP9umYym01ei6Fnz56yWq32ZF7NmjXVunVrJSQkqEWLFm7XO3r0aFksFrVu3Vrx8fFKT09XUlKSfvnlF1mtVvXt21dt27bVyJEjVaNGDZnNgZ8qMxmuTiz2gJycHMXGxio7O1sxMTG+bh4AADihvN/r09dbjXhaltCwc9ZjLSrUH/9+jN/9AEbfDACAysHbv9k712To54+2O2yGERUXqu7XNVNKhzoebw/uqXwTgwEAQEAw/fdwphwAAAACX0qHOmp8Xvyp3W1zihQZc2pqrTdH5MF1JPMAAAAAAAAg6dSU23ot4vwdBs6CZB4AAHCPs5tbsAEGAAAA4DEk8wAAgFtMxqnDmXIAAAAAPCPwt+gAAAAAAAAAIImReQAAwF1MswUAAAB8jmQeAABwH4k6AAAAwKeYZgsAAAAAAABUEozMAwAAbmEDDAAAAMD3SOYBAAD3sGYeAAAA4HMk8wAAgFsYmQcAAAD4HmvmAQAAAAAAAJUEI/MAAIB7mGYLAAAA+BzJPAAA4Bam2QIAAAC+xzRbAAAAAAAAoJIgmQcAANxjuHC4wGq1aty4cWrcuLHCw8OVkpKiiRMnyjD+V9GwYcNkMpkcjrS0NId6Jk2apG7duikiIkI1atQos62/1mEymfThhx+6FjAAAADgQ0yzBQAA7vHSmnlTpkzR9OnTNWvWLKWmpmrlypW69dZbFRsbq/vvv99eLi0tTTNmzLCfh4aGOtRTXFysa6+9Vl27dtXbb79dbnszZsxwSASWl/gDAAAAAgHJPAAAEFCWLVumK664QgMGDJAkNWrUSB988IFWrFjhUC40NFQJCQnl1jNhwgRJ0syZM8/aXo0aNc5aDwAAABBImGYLAADccnoDDGcOScrJyXE4ioqKyqy3W7duWrhwobZt2yZJWrdunZYuXar+/fs7lFu8eLHq1KmjFi1aaPjw4crKynLrc9x7772qXbu2unTponfeecdhOi8AAAAQaBiZBwAA3OPiNNsGDRo4XB4/fryefPLJM4qPHTtWOTk5atmypSwWi6xWqyZNmqSbbrrJXiYtLU1XXXWVGjdurJ07d+qxxx5T//799euvv8pisTj9Ef71r3/p4osvVkREhObPn68RI0YoLy/PYTovAAAAEEhI5gEAAJ/Yt2+fYmJi7Od/XePutI8//lizZ8/W+++/r9TUVK1du1YjR45UUlKShg4dKkkaMmSIvXzbtm3Vrl07paSkaPHixerTp4/TMY0bN87+zx06dNDJkyf13HPPkcwDAABAwGKaLQAAcIvJMJw+JCkmJsbhKC+ZN2bMGI0dO1ZDhgxR27ZtdfPNN+vBBx/U5MmTy42lSZMmql27tnbs2FGhz3TBBRdo//795U4BBgAAAPyNkXkAAMA9XtrNNj8/X2az4/tGi8Uim81W7jP79+9XVlaWEhMTXWvsL9auXau4uLhyE40AAACAv5HMAwAAbvnz5hbnKueKgQMHatKkSWrYsKFSU1O1Zs0aTZ06VbfddpskKS8vTxMmTNDVV1+thIQE7dy5Uw8//LCaNm2qfv362etJT0/XsWPHlJ6eLqvVqrVr10qSmjZtqqioKH399dc6cuSI/va3vyksLEwLFizQ008/rdGjR7sWMAAAAOBDJPMAAEBAmTZtmsaNG6cRI0YoIyNDSUlJuvvuu/XEE09IOjVKb/369Zo1a5ZOnDihpKQk9e3bVxMnTnQYUffEE09o1qxZ9vMOHTpIkhYtWqRevXopODhYr776qh588EEZhqGmTZtq6tSpuvPOO337gQEAAAAXmAzDcPF9ecXl5OQoNjZW2dnZDgthAwCAwFHe7/Xp6x1unCRLSNg567EWF2rN+4/zux/A6JsBAFA58JsNiZF5AADATd6aZgsAAACgfOxmCwAAAAAAAFQSjMwDAADu8dJutgAAAADKRzIPAAC4hWm2AAAAgO8xzRYAAAAAAACoJBiZBwAA3MM0WwAAAMDnSOYBAAC3MYUWAAAA8C2SeQAAwD2GcepwphwAAAAAj2DNPAAAAAAAAKCSYGQeAABwC7vZAgAAAL5HMg8AALiHDTAAAAAAn2OaLQAAAAAAAFBJMDIPAAC4xWQ7dThTDgAAAIBnkMwDAADuYZotAAAA4HNMswUAAAAAAAAqCUbmAQAAt7CbLQAAAOB7JPMAAIB7DOPU4Uw5AAAAAB7BNFsAAAAAAACgkmBkHgAAcAvTbAEAAADfI5kHAADcw262AAAAgM+RzAMAAG5hZB4AAADge6yZBwAAAAAAAFQSjMwDAADuYTdbAAAAwOdI5gEAALcwzRYAAADwPabZAgAAAAAAAJUEI/MAAIB72M0WAAAA8DmSeQAAwC1MswUAAAB8j2m2AAAAAAAAQCXByDwAAOAem3HqcKYcAAAAAI8gmQcAANzDmnkAAACAzzHNFgAAAAAAAKgkGJkHAADcYpKTG2B4PRIAAACg+iCZBwAA3GMYpw5nygEAAADwCJJ5AADALSbDyZF55PIAAAAAj2HNPAAAAAAAAKCSYGQeAABwD7vZAgAAAD5HMs8DMtIztXfzfhWeLFJJcaksQRaFRYQoMSVB9ZsnymxmACQAoOoxGYZMTqyH50wZwJNsNkPHD59U7tFClZbYZLPaZAk2KzjUolpJUYqsEervEAEAANxGMs8Nfyzfrl+/+l3bV+/S9lW7lH00t9yyEdHhSmnfSM06NlHHS9upc1p7knsAAAAeZLPatGd9lvZvO67Mvbk6uj9XpcW2
cstHxIQoPjla8Q2j1aR9vOIbRPswWgAAgIohmeekwvwi/fj+Us19bZ62r97t9HP5uQXa8PMf2vDzH/r85W+U0CheA+66VP3v6KPY2jFejBgAAC+z/fdwphzgBSezi7Tp54Pa/PMBncwudvq5/Jxi7d2Qpb0bsrTymz1KaBKjNj3rq2nHOrIE89IVAAAENpJ552C1WvXpC3P14TNfKO/EyQrXd3hPpt5+7H29O+ET9b/9Yt3xzE0Kjwr3QKQAAPgW02zhL4UnS/TLZzu07bfDstkq/ufr8K4cHd61WUs/2a4ulzdWm571ZDKZPBApAACA55HMO4u9f+zX87e+qi0rdni87pKiEn3173la/s1qjXpruDr2aevxNgAAAKqa3esytXj2VuXnOD8Sz1mFeSX66cNt2rk6Qxff0koxtXnhCgAAAg/zCMpgGIY+evZLDe/4sFcSeX92ZG+mxvadqJeHv6GigiKvtgUAgEcZLhxABRUXlGrBjE36dvoGryTy/uzAthP6YOIKbVyy36vtAAAAuIOReX9htVr10l2v6/sZi3zWpmEYmvv6Au3dvF8Tvx6ryJgIn7UNAIDbDOPU4Uw5oAIKcov19bR1ykwvf9MxTystsmrJB9t0IrNA3a9p5rN2AQAAzoWReX9is9n0zM3TfJrI+7MNP/+hhy/5l07m5PulfQAAgEBTkFusL15Y7dNE3p+t+2GfFr+/1S9tAwAAlIVk3p+8fM8bWvzhL36NYdvKnRo36Bmm3AIAAp7JcP4A3FFUUKqv/m+tjh/274vOTT8d0K9feHfpFQAAAGcxzfa/vn3zB3371kL7ec26JWrSqkCRMVYFhxqyWU0qLjIp82Cwdm0OV0mR9/KgG376Q2+MeU//eOUOr7UBAECFMc0WXrZk9hYd3Zfn7zAkSavnpatOoxildKjj71AAAEA1RzJPUsa+o/rmtdf094eOqHm7fDVtV6BadUvLLV9aIqVvC9O29RHauDxSP31dQ0UFnk3ufT19vi66pqvO65Xq0XoBAPAUk+3U4Uw5wFU712Ro+8oMf4fhYMkH21SvWZzCooL9HQoAAKjGqlwyLz+3QNtX71JG+lEVFxTLZrUpODRYETHhaty2oeo3T5LJZJIkGUapVLRAOTue1rRvjjjdRlCw1CS1UE1SC5V2wzHdM+GAFnxSU3Nn1dL+nWEe+RyGYej52/+tN9a/oPBIz9QJAADgDyezi5SZnquC3BJZS09ldy1BZoVHByu+YbQiY0PtZYvyS7R+0X79/s1uf4VbroKcYv304Vb1vaONv0MBAADVWKVP5uXnFmjRB0u1/qfN2r5ql/ZvOyTjLNN5ImLC1bRDY11280l177tUwUHH1KRVxWKIirXpyjuO6so7jmr5gmi98lh9ZRwIqVilkg7vztCMxz/QiJdurXBdAAB4HNNsUY7cY4Xa+tthHdmdrYz0XOVnF5+1fERsiOIbRKu4sFQZe3NkLQncPzPbV2aoWedMNT4v3t+hAACAaqrSJvN2b0zX1/+ep4Wzf1Z+boHTz4WF5eiqW79R1745Xonrgktz1eZvW/XWxCR9+59aFa7v2zd/0M3jr1V0XJQHogMAwIOM/x7OlEOVZxiG0jcf08YlB7R3Y5YMm/P/4vOzi7U3O8uL0XnW6nl7SeYBAAC/qXTJvMN7MvTy8De0ct46l5/tecVx/ePpA4qOs3ohsv+JjLbpgWf366KBJ/TcAw2Vddj9dVWKCoo1b8YiXTNqoAcjBAAA8Jz9W49ryftbdeKIf3ed9ZXDu3KUuS9X8Q2i/R0KAACohry3JauHGYahr/49T3e1e8itRN5192bosenpXk/k/VmHHnl68avtSmpcVKF65r6+4KxThwEA8AeTYTh9oGoqLizVkve36suX1lSbRN5pG5cc8HcIAACgmqoUybyjB7L08CUTNO2+t1SQV+jy8zc9eFi3P37IC5GdW936JXrhix2q18T9hN6B7Ye0ZuEGD0YFAIAHnF4zz5kDVc7BHSf04cQV2vjTgWo5lXrb70dUXFjq7zAAAEA1FPDJvP3bD+mBC/+ptYs2ufX84NszdcsY53eq9YaadUr1zEc7FV/v7Is/n82aHzd6MCIAAAD37Vqbqa9eWqvcLNdfslYVpUVWHdntnTWYAQAAziagk3n7tx/SQz2fUEb6Ubeeb33+Sd315EEPR+WeOvVK9Oi/98pkcu/V9fbVuzwcEQAAFWRIsjlxVMNRW1XZjlUZmvfGRllLbf4Oxe8y03P9HQIAAKiGAjaZl7k/S49c+i8dO3zCreeDQ20a9eI+WSyejasiUjvn66q7Mt16dvsqknkAgMDCmnnVT/rmLC2YsUk2F3aqrcoy9pLMAwAAvheQyTzDMDT5ppfdHpEnScMeOawGKRXbeMIbbnn4sFvr5+Vk5erwngwvRAQAAHBuJ7OLNP/tTbKVksg7LTOdabYAAMD3AjKZN2fad9rw8x9uP9/8vHxdead7I+C8LSzc0Kip6W49e2iXf9f+AwDAgSEnN8Dwd6DwhCXvb1XRSTZ8+LOco4UyGHkKAAB8LOCSeQd2HNI7j71foTquHZ4RUNNr/6pNl3y17nzS5eeKC9zfQAMAAI9jN9tqY+vyw9q9zv0ZE1WZtYS1AwEAgG8FXDLvpbtfV2G++9Nja9YpUde0wJ/yMHCo6x3i0hKrFyIBAMBNzmx+cfpApVV4skQ/f7zN32EELJuVZDUAAPCtgErmbV25U2sXbapQHf1vylJwSOB3qroPyFZsTdemqgSHBnspGgAAgLJtXnqQ6bVnYQkKqO40AACoBgKq9/H1v+dVsAZD/W865pFYvC0k1FDfIa7FGh4V5qVoAABwHbvZVn2GzdCmnw/4O4yAZTabZAkOqO40AACoBgKm95F7PE+LP/qlQnXUTylSfFKJhyLyvvYX5rlUPjm1vpciAQDADayZV+Xt3ZSlnKOF/g4jYMUlRfo7BAAAUA0FTDLvh/d+UlEFN3ho1q7AQ9H4RtN2+U6XTWgUr5ia0V6MBgCAwGC1WjVu3Dg1btxY4eHhSklJ0cSJEx12DR02bJhMJpPDkZaW5lDPpEmT1K1bN0VERKhGjRpltpWenq4BAwYoIiJCderU0ZgxY1RaypTS0zYvPejvEAJanYb0zQAAgO8F+TuA09Yt3ljhOpq5kBwLBDVqWRVfr1iZB0LOWbZZpyY+iAgAABc4O+rOxZF5U6ZM0fTp0zVr1iylpqZq5cqVuvXWWxUbG6v777/fXi4tLU0zZsywn4eGhjrUU1xcrGuvvVZdu3bV22+/fUY7VqtVAwYMUEJCgpYtW6ZDhw7plltuUXBwsJ5++mmXYq6KDMPQgW0n/B1GQIsnmQcAAPwgYJJ521btqnAdlW1kniQ1a1vgXDKvY4oPogEAwAVeSuYtW7ZMV1xxhQYMGCBJatSokT744AOtWLHCoVxoaKgSEhLKrWfChAmSpJkzZ5Z5f/78+dq8ebN++OEH1a1bV+3bt9fEiRP1yCOP6Mknn1RIyLl/n6uy7MwCFRcwSvFs4pNJ5gEAAN8LiGm2JzKzlbkvq8L1JDS
s2DRdf0hMLnKqXNdB53s5EgAAvCsnJ8fhKCoq+zewW7duWrhwobZt2yZJWrdunZYuXar+/fs7lFu8eLHq1KmjFi1aaPjw4crKcq0v8euvv6pt27aqW7eu/Vq/fv2Uk5OjTZs2ufjpqp7M9Fx/hxDQImuEMs0WAAD4RUCMzNvugVF5khQSZvNIPb4UEnbu0QrterZWo9QGPogGAAAX2CSZnCwnqUEDx9+y8ePH68knnzyj+NixY5WTk6OWLVvKYrHIarVq0qRJuummm+xl0tLSdNVVV6lx48bauXOnHnvsMfXv31+//vqrLBaLU+EfPnzYIZEnyX5++PBhp+qoyjL3ksw7m9QeSTJbAuK9OAAAqGYCIpl3eE+mR+oJCqp8u+UFBZ875kHD+/kgEgAAXGMyDJmcmEJ7usy+ffsUExNjv/7XNe5O+/jjjzV79my9//77Sk1N1dq1azVy5EglJSVp6NChkqQhQ4bYy7dt21bt2rVTSkqKFi9erD59+lTkY+G/crLYxbY8ZotJrbsn+TsMAABQTQVEMq+4grvYnlZSbJb99X8lUVJ09iENNRPjdOGVXXwUDQAA3hMTE+OQzCvPmDFjNHbsWHvCrm3bttq7d68mT55sT+b9VZMmTVS7dm3t2LHD6WReQkLCGevwHTlyxH6vurOWWP0dQsBqfF68ImPLTkYDAAB4W0DMDbCWeqazWJAXEB/HJQX5Z4/55ieuVVBwQORcAQBwdHoDDGcOF+Tn58tsdvx9tFgsstnKf2G3f/9+ZWVlKTEx0el2unbtqg0bNigjI8N+bcGCBYqJiVHr1q1dirkqstkq34wHXzAHmdR5QCN/hwEAAKqxgMgSBYcGe6SevdvClNS4cm2CsWdLWLn3Ol7SVpfffakPowEAwAU2QzI5kfBxMSk0cOBATZo0SQ0bNlRqaqrWrFmjqVOn6rbbbpMk5eXlacKECbr66quVkJCgnTt36uGHH1bTpk3Vr9//lqZIT0/XsWPHlJ6eLqvVqrVr10qSmjZtqqioKPXt21etW7fWzTffrGeffVaHDx/WP//5T917773lTgGuTixBle8lqS90HtBYtepF+TsMAABQjQVEMi/IQ8m87evD1bVfjkfq8pUdGyLKvB4RHa5Rbw73cTQAALjA2VF3Lo7MmzZtmsaNG6cRI0YoIyNDSUlJuvvuu/XEE09IOjVKb/369Zo1a5ZOnDihpKQk9e3bVxMnTnRIwj3xxBOaNWuW/bxDhw6SpEWLFqlXr16yWCyaO3euhg8frq5duyoyMlJDhw7Vv/71L5firaoYmXemOsnR6tgv2d9hAACAai4gknkbftrkkXq2ry87MRaoDu4O0cmcsnfcu+u5m1U3Od7HEQEA4H/R0dF66aWX9NJLL5V5Pzw8XPPmzTtnPTNnztTMmTPPWiY5OVnffvutG1FWbTarTUf3sZvtn1mCzbr4llYym53ZwhkAAMB7/J7M2/jLFi35aJlH6tq+Ptwj9fjKtnKSj9eMGqgBdzG9FgAQ6JxdD48RXpXN6nnpOnmici1d4k1ms0n97khlei0AAAgIfk3mFRcW64Xb/+2xaRwdeuTJWipZ/J6idM6v887c0e/yuy/V3c/f4odoAABwkZem2cK/sg7m6fdvd/s7jIBhNpvUZ1grNT6PGRMAACAw+DXtteiDpdq/7VCF64mLL9EDz+6vVOvlHcsI0tJvYh2uXTfmCt055e9+iggAAEBa+e0e2UpJwEqnNgHpe0eqmrQnkQcAAAKHX5N53769sMJ1NGpZoKc/2KVadUs9EJHvzPugpkpLTu0SFx4VpofeHqGe13b1c1QAALjAZsipKbRspFBp5OcUa9eaTH+HERDiEiLU9442ql2fqbUAACCw+DWZt2fDPgWZ3N/Jtvl5+Xr6/V2KjrN6MCrvs5ZK3/ynlsO1Vn9r5qdoAABwk2E7dThTDpXClt8OymYl+SpJQSFm1UqK9HcYAAAAZzD7OwB3JTcv1KTZlS+RJ51K5GUeCLGfF+QV6vOXvvFjRAAAANKWZYf9HULAyEzP095NWf4OAwAA4AyVMpkXGWPVU7N3KaZm5UvkHdoborcmJp5xff6sxSoqKPJDRAAAuOn0BhjOHKgU8k7QF/mzjUsO+DsEAACAM1TKZN49Ew6oTr0Sf4fhMptNmjqqgYoKLGfcyz2Wp0UfLvNDVAAAuMlmOH8AlVD6pizlHC3wdxgAAAAOKl0yr/PFOep7/XF/h+GWr2fW0vpfy19Eee5r83wYDQAAAM7GMKRNPzM6DwAABJZKlcyLiLbqgWf3+zsMt/y2IEavja931jJbf9+pnGO5PooIAIAKYpotqoG9m475OwQAAAAHlSqZN2jYUcX/f3v3HxT1fedx/LXsD1hxWdwArkRAQMECh0FraWJMsCEGo+kdyUwqitGStKn3I2Njctcfnk0TM502mfPumjR/9PiRuYvJaM9zziYznbR6R35NZjxKUtsmarFRgtYai8iPlR/7vT867pVjgd2V5btfeD5mvn/w3c9+eX8Zx33z4vP5frKtt7z22H/N1Z4v5yk4Ypt07Mn/6ZiGigAAmAKGIgzzzC4UiN0fuvo0PGS95zQDAICZyzJhXlKSobu3WG9HsTd+7NW3tuZr6GpkP+oTxwjzAAAWwcw8zALBoKGLnb1mlwEAABDiMLuASFXe2aP5C60zKy/Qn6Tm7/h1qDFD0uQz8q452fab+BUFAACAqP3+oyvy53vNLgMAAECShcK8DVsvml1CxD4+7dKuzQXq+m1y1O89+2FXHCoCACAOgkFJwQjHAdbV/bt+s0sAAAAIsUSYl+weUcVq6yxvOPeRM6YgT5Ku9g9OcTUAAMRJpEtoWWYLixse5Jl5AAAgcVjimXmFZQHZ7WZXEbkl5YGY3zs8ODyFlQAAAOB6jYwQSAMAgMRhiTBvSbm1ljZ4fSPKujG2GXYOlyUmSwIAwAYYmDXs9siffwwAABBvlgjzisoHzC4harEGkMlzXFNcCQAAcRI0Ij8AC3O4LLREBAAAzHiWCPPyimNftmqWRUtjqzmnOHuKKwEAAMD1SJ8/x+wSAAAAQiyxpjMl1Xq74MVa85LlhVNcCQAA8WEYQRnG5J93kYxBYkh2OyT2ehgjM89jdgkAAAAhlpiZ53RZ75cApyu2JUVFny6Y4koAAIgTI8IltjwzzzKKVvrNLiHhJCXZlLFwrtllAAAAhJga5mXlZUQ0bmTIeg8dHhmOreYlKwjzAACAOT516wLJem1XXM3LTpXDyTPzAABA4jA1zLtr25qIxl0NWGIC4ShXB6LvhItXFirNxzIOAIBFsJvtjJOeNUc5S+eZXUZCySv1mV0CAADAKKamZGu3VcmbMXl4df6M9XZ4PX8mOer33LP9rjhUAgBAnASDkR+wjOU1i8wuIWHYbFLp6hvNLgMAAGAUU8O8NJ9Hf/39Bycdd/J96+0gdvJ9d1TjPb65WrNxVZyqAQAgDpiZNyMtLJ6nktXZZpeREHLLblBaRnQ9HQAAQLyZvn616gurtPq+ygnHRBuMmS0wYNNHJ1Kies/arVVypV
hvBiIAAJh5Vt23WB5fdL3MTFR2G7PyAABA4jE9zJOkv3n+SxMut7VamNfxS7eCI5E/M2+Ox617d6yPY0UAAEw9IxiM+IC1uFIcWrNlqdllmCorz6O80hvMLgMAAGCMhAjz5mV5tftHjynZHX5mWvdFpz76MPpn0Jnl/bfnRjX+4WcfUFZOZDv7AgCQMFhmO6PlfMqnm2sLzS7DFHZHku7YWiJbElv7AgCAxJMQYZ4kld9Wor8/sFNOlyPs6z/+V2v8ZXRkRHrt3yKvdfmd5br7S9VxrAgAACA2y+/K04qaPLPLmHYrNyySLzvV7DIAAADCSpgwT5Iq716upw5/TSmpY2fh/XS/TwN9CVVuWMeOePS7zsiefTcnza2dP/xKnCsCACBOgkbkByzrs39RqMrP55tdxrTJyvOoYu3sCzABAIB1JFw6tuLOZdrb+pQWleWMOt/fa9eRg+nmFBWFwy9GtlzW6XLoWz96TFm5mXGuCACAODEMyQhGcBDmWd2n787XnQ+WKDk1/AqKmcLjS9G6r/yZklheCwAAEljChXmStLgiXz849l3Vfb1Wdoc9dP4/fpipwauJ21x1/DJFx46Ov5HHNQ6nXd94eYeWV5dPQ1UAAADXr2ilX3W7K7WofGY+5zfV69Lnd9ykufPYxRcAACS2hAzzJMnpcqrh6U3653eeVsnNRZKks6dStG/vfJMrC294SHr2qzkyjInDxmS3S98+9He6tbZymioDACA+jKAR8YGZIdWbrPV/Wa7qL5bI45va0CspOKSM378n+/DAlF43EmmZbt37+AqlZ82Z9u8NAAAQrYRfK1G0olD/9NbTOtnWof/8wU90qOkN3bLusoqWTX+jN5FXvj9fvzk+cQNYUJ6nx1v+Sotvmj3PnQEAzGBGUFIwwnGYSYor/SpaOV+//cVFHf/vj3XmV5divlbKwEXd2PWGFpx/R66hPvW5s/TrpVvU4y2YworHV7g8U7fXFcvtieyZxwAAAGZL+DDvmiXLC7TzX7br4WcfUNtP/l0Fw9+RwzFidlmS/ri8dt8/jj9j0O6wq+7rtdq86z45nJb5kQMAAIzLlmRT/rJM5S/LVPeFfv2q5XV9/M6HuuLJ1YA7Q7KFXwDiGOqT58oZeXrPal73Sfku/Vo2/d/szdSBC1rx83/QmZw7dHrRegXt8QnZ3B6nbttYrMUrsuJyfQAAgHixXLI0Nz1Vt33hARkDHhmXvybJ3KU7ly449NSXF2lkOPzy2pvWlOrhZ7dqcQWz8QAAM4sRNGTYJv8cNtgAY8ZLz5qjW/72z9X1jW/q8sFva9ieot7UbI3YkxVMcspmjCgpOCR34BO5A59Mej2bDOWd/akyPvmFThXeq098JeOGg9GyO5K05DPzdUttIbPxAACAJZkS5l1r6nt6eq7jKnfIMB6V0fu9qSkqBle67dr9QI7OdCRJGgqdn+N1a83GW1Xz4OeUW3yjpOu9VwAApt+1z67xwrhh42pES2iH/+QzEolpanozKfXxx3T50iUFfvYzOQZPjmk0RyT1RnPBK10qbH9O2Sk+nfPfovP+z2jImRpTbWk3pGjpzQtU/Fm/3HNdGjICGuoJxHQtAADMMll/htnBZpjwL6Czs1M5OTnT/W0BAEAMzp49q4ULF4a+DgQCys/P1/nz5yO+ht/v1+nTp5WSwk6hiYjeDAAAa/n//RlmF1PCvGAwqK6uLnk8HtlsE+/+CgAAzGEYhq5cuaLs7GwlJY1e4hgIBDQ4OBjxtVwuF0FeAqM3AwDAGibqzzB7mBLmAQAAAAAAAIgeMS4AAAAAAABgEYR5AAAAAAAAgEUQ5gEAAAAAAAAWQZgHAAAAAAAAWARhHjDLVFVVaceOHWPOt7S0KD09XZL0xBNPyGazqaamZsy4Z555RjabTVVVVWNe6+zslMvlUllZWdjvbbPZQofX69WqVat05MiR0Outra265557lJ2dLZvNpkOHDsVyiwAAAJZBbwYAiBZhHoCwFixYoKNHj6qzs3PU+aamJuXm5oZ9T0tLi+6//3719PTo3XffDTumublZ586d01tvvaWMjAxt2LBBHR0dkqS+vj4tW7ZMzz///NTeDAAAgMXRmwEAriHMAxBWVlaW1q5dqxdffDF07u2339bFixe1fv36MeMNw1Bzc7O2bNmiTZs2qbGxMex109PT5ff7VVZWphdeeEEDAwN6/fXXJUnr1q3Tnj17VFtbG5+bAgAAsCh6MwDANYR5AMbV0NCglpaW0NdNTU3avHmzXC7XmLFHjx5Vf3+/qqurVV9fr1deeUV9fX0TXt/tdkuSBgcHp7RuAACAmYjeDAAgEeYBmMCGDRvU09Oj1tZW9fX1af/+/WpoaAg7trGxURs3bpTdbldZWZkKCgp04MCBca/d39+vXbt2yW636/bbb4/XLQAAAMwY9GYAAElymF0AgMTldDpVX1+v5uZmdXR0qKioSOXl5WPGdXd36+DBg3rzzTdD5+rr69XY2Kht27aNGltXVye73a6BgQFlZmaqsbEx7DUBAAAwGr0ZAEAizANmnbS0NF2+fHnM+e7ubnm93jHnGxoaVFlZqePHj4/7l999+/YpEAiosrIydM4wDAWDQZ04cUJFRUWh83v37lV1dbW8Xq8yMzOn4I4AAACsi94MABAtltkCs0xxcbHa2trGnG9raxvV2F1TWlqq0tJSHT9+XJs2bQp7zcbGRu3cuVPt7e2h47333tPq1avV1NQ0aqzf79fixYtpFgEAAERvBgCIHjPzgFlm+/bteu655/TII4/ooYceUnJysl599VW9/PLLOnz4cNj3HDlyRENDQ0pPTx/zWnt7u9ra2vTSSy9p6dKlo16rq6vTk08+qT179sjhmPy/m97eXp06dSr09enTp9Xe3i6fz6fc3NzobhQAAMAC6M0AANFiZh4wyxQUFKi1tVUffPCBqqurVVlZqf379+vAgQOqqakJ+57U1NSwzaL0x7/8lpSUjGkWJam2tlYXLlzQa6+9FlFtx44dU0VFhSoqKiRJjz76qCoqKrR79+7Ibg4AAMBi6M0AANGyGYZhmF0EAAAAAAAAgMkxMw8AAAAAAACwCMI8AAAAAAAAwCII8wAAAAAAAACLIMwDAAAAAAAALIIwDwAAAAAAALAIwjwAAAAAAADAIgjzAAAAAAAAAIsgzAMAAAAAAAAsgjAPAAAAAAAAsAjCPAAAAAAAAMAiCPMAAAAAAAAAiyDMAwAAAAAAACzifwE00MCs0PUzbwAAAABJRU5ErkJggg==",
+      "text/plain": [
+       "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "ep.pp.neighbors(edata, use_rep=\"saits_latent\")\n", + "ep.tl.umap(edata)\n", + "ep.pl.umap(edata, color=[\"gender_concept_id\", \"race_source_value\"])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ehrapy_venv_oct", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 929644a2c87bc3d15001e932472e57eab3dd5ee2 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Thu, 21 Nov 2024 09:08:32 +0100 Subject: [PATCH 31/43] add pypots to tutorial index --- docs/index.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/index.md b/docs/index.md index da01b06..7065d92 100644 --- a/docs/index.md +++ b/docs/index.md @@ -16,4 +16,5 @@ notebooks/omop_tables_tutorial notebooks/cohort_definition notebooks/study_design_example_omop_cdm notebooks/indwelling_arterial_catheters +notebooks/tutorial_time_series_with_pypots ``` From 3746c24bc600861342ae273d48a1ca35974098a9 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Thu, 21 Nov 2024 18:15:00 +0100 Subject: [PATCH 32/43] add one-hot presence encoding for a feature --- src/ehrdata/io/omop/_queries.py | 15 +++++++++++---- src/ehrdata/io/omop/omop.py | 2 +- tests/test_io/test_omop.py | 6 +++--- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/src/ehrdata/io/omop/_queries.py b/src/ehrdata/io/omop/_queries.py index 2975231..c4b24f8 100644 --- a/src/ehrdata/io/omop/_queries.py +++ b/src/ehrdata/io/omop/_queries.py @@ -77,8 +77,11 @@ def _drop_timedeltas(backend_handle: duckdb.duckdb.DuckDBPyConnection): def _generate_value_query(data_table: str, data_field_to_keep: Sequence, aggregation_strategy: str) -> str: - query = f"{', ' .join([f'CASE WHEN COUNT(*) = 0 THEN NULL ELSE {aggregation_strategy}({column}) END AS {column}' for column in data_field_to_keep])}" - return query + # is_present is 1 in all rows of the data_table; but need an aggregation operation, so use LAST + is_present_query = "LAST(is_present) as is_present, " + value_query = f"{', ' .join([f'{aggregation_strategy}({column}) AS {column}' for column in data_field_to_keep])}" + + return is_present_query + value_query def time_interval_table_query_long_format( @@ -137,10 +140,14 @@ def time_interval_table_query_long_format( SELECT person_id, data_table_concept_id, interval_step, start_date, start_date + interval_start_offset as interval_start, start_date + interval_end_offset as interval_end \ FROM long_format_backbone \ CROSS JOIN timedeltas \ + ), \ + data_table_with_presence_indicator as( \ + SELECT *, 1 as is_present \ + FROM {data_table} \ ) \ - SELECT lfi.person_id, lfi.data_table_concept_id, interval_step, interval_start, interval_end, {_generate_value_query(data_table, data_field_to_keep, AGGREGATION_STRATEGY_KEY[aggregation_strategy])} \ + SELECT lfi.person_id, lfi.data_table_concept_id, interval_step, interval_start, interval_end, {_generate_value_query("data_table_with_presence_indicator", data_field_to_keep, AGGREGATION_STRATEGY_KEY[aggregation_strategy])} \ FROM long_format_intervals as lfi \ - LEFT JOIN {data_table} ON lfi.person_id = {data_table}.person_id AND lfi.data_table_concept_id = {data_table}.{DATA_TABLE_CONCEPT_ID_TRUNK[data_table]}_concept_id AND 
{data_table}.{data_table}_{date_prefix}date BETWEEN lfi.interval_start AND lfi.interval_end \ + LEFT JOIN data_table_with_presence_indicator ON lfi.person_id = data_table_with_presence_indicator.person_id AND lfi.data_table_concept_id = data_table_with_presence_indicator.{DATA_TABLE_CONCEPT_ID_TRUNK[data_table]}_concept_id AND data_table_with_presence_indicator.{data_table}_{date_prefix}date BETWEEN lfi.interval_start AND lfi.interval_end \ GROUP BY lfi.person_id, lfi.data_table_concept_id, interval_step, interval_start, interval_end """ ).df() diff --git a/src/ehrdata/io/omop/omop.py b/src/ehrdata/io/omop/omop.py index 876b9d2..8c67c31 100644 --- a/src/ehrdata/io/omop/omop.py +++ b/src/ehrdata/io/omop/omop.py @@ -275,7 +275,7 @@ def setup_variables( data_tables The table to be used. Only a single table can be used. data_field_to_keep - The CDM Field in the data table to be kept. Can be e.g. "value_as_number" or "value_as_concept_id". + The CDM Field in the data table to be kept. Can be e.g. "value_as_number" or "value_as_concept_id". Importantly, can be "is_present" to have a one-hot encoding of the presence of the feature in a patient in an interval. start_time Starting time for values to be included. interval_length_number diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py index 41a99ba..62773e9 100644 --- a/tests/test_io/test_omop.py +++ b/tests/test_io/test_omop.py @@ -92,9 +92,9 @@ def test_setup_obs_invalid_observation_table_value(omop_connection_vanilla): @pytest.mark.parametrize( "data_tables,data_field_to_keep", [ - (["measurement"], ["value_as_number", "value_as_concept_id"]), - (["observation"], ["value_as_number", "value_as_concept_id"]), - (["specimen"], ["quantity"]), + (["measurement"], ["value_as_number", "value_as_concept_id", "is_present"]), + (["observation"], ["value_as_number", "value_as_concept_id", "is_present"]), + (["specimen"], ["quantity", "is_present"]), ], ) @pytest.mark.parametrize( From ac0c89b8130506d1de3f2f3b14a24f04d959160c Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Thu, 21 Nov 2024 19:16:29 +0100 Subject: [PATCH 33/43] add first basic interval-variable support for drug_exposure --- src/ehrdata/io/omop/_queries.py | 77 +++++++++++++++++++++++++++++++++ src/ehrdata/io/omop/omop.py | 52 ++++++++++++++-------- tests/test_io/test_omop.py | 15 ++++--- 3 files changed, 119 insertions(+), 25 deletions(-) diff --git a/src/ehrdata/io/omop/_queries.py b/src/ehrdata/io/omop/_queries.py index c4b24f8..080c8e3 100644 --- a/src/ehrdata/io/omop/_queries.py +++ b/src/ehrdata/io/omop/_queries.py @@ -155,3 +155,80 @@ def time_interval_table_query_long_format( _drop_timedeltas(backend_handle) return df + + +def time_interval_table_for_interval_tables_query_long_format( + backend_handle: duckdb.duckdb.DuckDBPyConnection, + time_defining_table: str, + data_table: str, + interval_length_number: int, + interval_length_unit: str, + num_intervals: int, + aggregation_strategy: str, + data_field_to_keep: Sequence[str] | str, + date_prefix: str = "", +) -> pd.DataFrame: + """Returns a long format DataFrame from the data_table. The following columns should be considered the indices of this long format: person_id, data_table_concept_id, interval_step. 
The other columns, except for start_date and end_date, should be considered the values.""" + if isinstance(data_field_to_keep, str): + data_field_to_keep = [data_field_to_keep] + + if date_prefix != "": + date_prefix = date_prefix + "_" + + timedeltas_dataframe = _generate_timedeltas(interval_length_number, interval_length_unit, num_intervals) + + _write_timedeltas_to_db( + backend_handle, + timedeltas_dataframe, + ) + + # multi-step query + # 1. Create person_time_defining_table, which matches the one created for obs. Needs to contain the person_id, and the start date in particular. + # 2. Create person_data_table (data_table is typically measurement), which contains the cross product of person_id and the distinct concept_id s. + # 3. Create long_format_backbone, which is the left join of person_time_defining_table and person_data_table. + # 4. Create long_format_intervals, which is the cross product of long_format_backbone and timedeltas. This table contains most notably the person_id, the concept_id, the interval start and end dates. + # 5. Create the final table, which is the join with the data_table (typically measurement); each measurement is assigned to its person_id, its concept_id, and the interval it fits into. + df = backend_handle.execute( + f""" + WITH person_time_defining_table AS ( \ + SELECT person.person_id as person_id, {START_DATE_KEY[time_defining_table]} as start_date, {END_DATE_KEY[time_defining_table]} as end_date \ + FROM person \ + JOIN {time_defining_table} ON person.person_id = {time_defining_table}.{TIME_DEFINING_TABLE_SUBJECT_KEY[time_defining_table]} \ + ), \ + person_data_table AS( \ + WITH distinct_data_table_concept_ids AS ( \ + SELECT DISTINCT {DATA_TABLE_CONCEPT_ID_TRUNK[data_table]}_concept_id + FROM {data_table} \ + ) + SELECT person.person_id, {DATA_TABLE_CONCEPT_ID_TRUNK[data_table]}_concept_id as data_table_concept_id \ + FROM person \ + CROSS JOIN distinct_data_table_concept_ids \ + ), \ + long_format_backbone as ( \ + SELECT person_time_defining_table.person_id, data_table_concept_id, start_date, end_date \ + FROM person_time_defining_table \ + LEFT JOIN person_data_table USING(person_id)\ + ), \ + long_format_intervals as ( \ + SELECT person_id, data_table_concept_id, interval_step, start_date, start_date + interval_start_offset as interval_start, start_date + interval_end_offset as interval_end \ + FROM long_format_backbone \ + CROSS JOIN timedeltas \ + ), \ + data_table_with_presence_indicator as( \ + SELECT *, 1 as is_present \ + FROM {data_table} \ + ) \ + SELECT lfi.person_id, lfi.data_table_concept_id, interval_step, interval_start, interval_end, {_generate_value_query("data_table_with_presence_indicator", data_field_to_keep, AGGREGATION_STRATEGY_KEY[aggregation_strategy])} \ + FROM long_format_intervals as lfi \ + LEFT JOIN data_table_with_presence_indicator ON lfi.person_id = data_table_with_presence_indicator.person_id \ + AND lfi.data_table_concept_id = data_table_with_presence_indicator.{DATA_TABLE_CONCEPT_ID_TRUNK[data_table]}_concept_id \ + AND (data_table_with_presence_indicator.{data_table}_start_date BETWEEN lfi.interval_start AND lfi.interval_end \ + OR data_table_with_presence_indicator.{data_table}_end_date BETWEEN lfi.interval_start AND lfi.interval_end \ + OR (data_table_with_presence_indicator.{data_table}_start_date < lfi.interval_start AND data_table_with_presence_indicator.{data_table}_end_date > lfi.interval_end)) \ + GROUP BY lfi.person_id, lfi.data_table_concept_id, interval_step, interval_start, interval_end + """ + 
).df() + + _drop_timedeltas(backend_handle) + + return df diff --git a/src/ehrdata/io/omop/omop.py b/src/ehrdata/io/omop/omop.py index 8c67c31..176f9af 100644 --- a/src/ehrdata/io/omop/omop.py +++ b/src/ehrdata/io/omop/omop.py @@ -32,6 +32,7 @@ _check_valid_variable_data_tables, ) from ehrdata.io.omop._queries import ( + time_interval_table_for_interval_tables_query_long_format, time_interval_table_query_long_format, ) from ehrdata.utils._omop_utils import get_table_catalog_dict @@ -396,7 +397,7 @@ def setup_interval_variables( *, backend_handle: duckdb.duckdb.DuckDBPyConnection, data_tables: Sequence[Literal["drug_exposure"]] | Literal["drug_exposure"], - data_field_to_keep: str | Sequence[str] | Literal["one-hot"], + data_field_to_keep: str | Sequence[str], interval_length_number: int, interval_length_unit: str, num_intervals: int, @@ -421,7 +422,7 @@ def setup_interval_variables( data_tables The table to be used. Only a single table can be used. data_field_to_keep - The CDM Field in the data table to be kept. Can be e.g. "value_as_number" or "value_as_concept_id". + The CDM Field in the data table to be kept. Can be e.g. "value_as_number" or "value_as_concept_id". Importantly, can be "is_present" to have a one-hot encoding of the presence of the feature in a patient in an interval. start_time Starting time for values to be included. interval_length_number @@ -469,24 +470,37 @@ def setup_interval_variables( return edata if keep_date == "start" or keep_date == "end": - date_prefix = keep_date - else: - raise NotImplementedError("support interval extraction coming soon") - ds = ( - time_interval_table_query_long_format( - backend_handle=backend_handle, - time_defining_table=time_defining_table, - data_table=data_tables[0], - data_field_to_keep=data_field_to_keep, - interval_length_number=interval_length_number, - interval_length_unit=interval_length_unit, - num_intervals=num_intervals, - aggregation_strategy=aggregation_strategy, - date_prefix=date_prefix, + ds = ( + time_interval_table_for_interval_tables_query_long_format( + backend_handle=backend_handle, + time_defining_table=time_defining_table, + data_table=data_tables[0], + data_field_to_keep=data_field_to_keep, + interval_length_number=interval_length_number, + interval_length_unit=interval_length_unit, + num_intervals=num_intervals, + aggregation_strategy=aggregation_strategy, + date_prefix=keep_date, + ) + .set_index(["person_id", "data_table_concept_id", "interval_step"]) + .to_xarray() + ) + elif keep_date == "interval": + ds = ( + time_interval_table_for_interval_tables_query_long_format( + backend_handle=backend_handle, + time_defining_table=time_defining_table, + data_table=data_tables[0], + data_field_to_keep=data_field_to_keep, + interval_length_number=interval_length_number, + interval_length_unit=interval_length_unit, + num_intervals=num_intervals, + aggregation_strategy=aggregation_strategy, + date_prefix=keep_date, + ) + .set_index(["person_id", "data_table_concept_id", "interval_step"]) + .to_xarray() ) - .set_index(["person_id", "data_table_concept_id", "interval_step"]) - .to_xarray() - ) var = ds["data_table_concept_id"].to_dataframe() diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py index 62773e9..706772e 100644 --- a/tests/test_io/test_omop.py +++ b/tests/test_io/test_omop.py @@ -139,23 +139,26 @@ def test_setup_variables( @pytest.mark.parametrize( "observation_table", - [ - "person_cohort", - ], # "person_observation_period", "person_visit_occurrence"], + ["person_cohort", 
"person_observation_period", "person_visit_occurrence"], ) @pytest.mark.parametrize( "data_tables,data_field_to_keep", [ - (["drug_exposure"], ["days_supply"]), # ["one-hot"] + (["drug_exposure"], ["days_supply"]), + (["drug_exposure"], ["is_present"]), + # (["condition_occurrence"], ["is_present"]), # TODO: write test file + # (["procedure_occurrence"], ["is_present"]), # TODO: write test file + # (["device_exposure"], ["is_present"]), # TODO: write test file + # (["note"], ["is_present"]), ], ) @pytest.mark.parametrize( "enrich_var_with_feature_info", - [False], # True, + [False, True], ) @pytest.mark.parametrize( "keep_date", - ["start", "end"], # "interval" + ["start", "end", "interval"], ) def test_setup_interval_variables( omop_connection_vanilla, From 0933889cd7f3dd14c72d0b2dedb040bf98b9d7be Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Thu, 21 Nov 2024 19:48:11 +0100 Subject: [PATCH 34/43] add strict tests for setup_variables --- tests/test_io/test_omop.py | 64 +++++++++++++++++++++++++++++++++++--- 1 file changed, 60 insertions(+), 4 deletions(-) diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py index 706772e..c43e57f 100644 --- a/tests/test_io/test_omop.py +++ b/tests/test_io/test_omop.py @@ -1,5 +1,6 @@ import re +import numpy as np import pytest import ehrdata as ed @@ -89,12 +90,64 @@ def test_setup_obs_invalid_observation_table_value(omop_connection_vanilla): "observation_table", ["person_cohort", "person_observation_period", "person_visit_occurrence"], ) +# test 1 field from table, and is_present encoding @pytest.mark.parametrize( - "data_tables,data_field_to_keep", + "data_tables,data_field_to_keep,target_r", [ - (["measurement"], ["value_as_number", "value_as_concept_id", "is_present"]), - (["observation"], ["value_as_number", "value_as_concept_id", "is_present"]), - (["specimen"], ["quantity", "is_present"]), + ( + ["measurement"], + ["value_as_number"], + [ + [[np.nan, np.nan, np.nan, np.nan], [18.0, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [20.0, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [22.0, np.nan, np.nan, np.nan]], + ], + ), + ( + ["measurement"], + ["is_present"], + [ + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + ], + ), + ( + ["observation"], + ["value_as_number"], + [ + [[np.nan, np.nan, np.nan, np.nan], [3, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [4, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [5, np.nan, np.nan, np.nan]], + ], + ), + ( + ["observation"], + ["is_present"], + [ + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + ], + ), + ( + ["specimen"], + ["quantity"], + [ + [[0.5, np.nan, np.nan, np.nan], [1.5, np.nan, np.nan, np.nan]], + [[0.5, np.nan, np.nan, np.nan], [1.5, np.nan, np.nan, np.nan]], + [[0.5, np.nan, np.nan, np.nan], [1.5, np.nan, np.nan, np.nan]], + ], + ), + ( + ["specimen"], + ["is_present"], + [ + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + ], + ), ], ) @pytest.mark.parametrize( @@ -112,6 +165,7 @@ def test_setup_variables( data_field_to_keep, enrich_var_with_feature_info, enrich_var_with_unit_info, + target_r, ): 
num_intervals = 4 con = omop_connection_vanilla @@ -136,6 +190,8 @@ def test_setup_variables( VAR_DIM_UNIT_INFO if enrich_var_with_unit_info else 0 ) + assert np.allclose(edata.r, np.array(target_r), equal_nan=True) + @pytest.mark.parametrize( "observation_table", From 5b387062cdb7d634633e61737c45d768f98bf78e Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Thu, 21 Nov 2024 20:00:27 +0100 Subject: [PATCH 35/43] add tight test for interval vars drug_exposure table --- src/ehrdata/io/omop/omop.py | 2 +- tests/test_io/test_omop.py | 74 +++++++++++++++++++++++++++++++++---- 2 files changed, 67 insertions(+), 9 deletions(-) diff --git a/src/ehrdata/io/omop/omop.py b/src/ehrdata/io/omop/omop.py index 176f9af..85d8de7 100644 --- a/src/ehrdata/io/omop/omop.py +++ b/src/ehrdata/io/omop/omop.py @@ -471,7 +471,7 @@ def setup_interval_variables( if keep_date == "start" or keep_date == "end": ds = ( - time_interval_table_for_interval_tables_query_long_format( + time_interval_table_query_long_format( backend_handle=backend_handle, time_defining_table=time_defining_table, data_table=data_tables[0], diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py index c43e57f..1ef28c9 100644 --- a/tests/test_io/test_omop.py +++ b/tests/test_io/test_omop.py @@ -197,11 +197,70 @@ def test_setup_variables( "observation_table", ["person_cohort", "person_observation_period", "person_visit_occurrence"], ) +# test 1 field from table, and is_present encoding, with start, end, and interval @pytest.mark.parametrize( - "data_tables,data_field_to_keep", + "data_tables,data_field_to_keep,keep_date,target_r", [ - (["drug_exposure"], ["days_supply"]), - (["drug_exposure"], ["is_present"]), + ( + ["drug_exposure"], + ["days_supply"], + "start", + [ + [[31.0, np.nan, np.nan, np.nan], [31.0, np.nan, np.nan, np.nan]], + [[31.0, np.nan, np.nan, np.nan], [31.0, np.nan, np.nan, np.nan]], + [[31.0, np.nan, np.nan, np.nan], [31.0, np.nan, np.nan, np.nan]], + ], + ), + ( + ["drug_exposure"], + ["days_supply"], + "end", + [ + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + ], + ), + ( + ["drug_exposure"], + ["days_supply"], + "interval", + [ + [[31.0, 31.0, 31.0, 31.0], [31.0, 31.0, 31.0, 31.0]], + [[31.0, 31.0, 31.0, 31.0], [31.0, 31.0, 31.0, 31.0]], + [[31.0, 31.0, 31.0, 31.0], [31.0, 31.0, 31.0, 31.0]], + ], + ), + ( + ["drug_exposure"], + ["is_present"], + "start", + [ + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + ], + ), + ( + ["drug_exposure"], + ["is_present"], + "end", + [ + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + ], + ), + ( + ["drug_exposure"], + ["is_present"], + "interval", + [ + [[1, 1, 1, 1], [1, 1, 1, 1]], + [[1, 1, 1, 1], [1, 1, 1, 1]], + [[1, 1, 1, 1], [1, 1, 1, 1]], + ], + ), # (["condition_occurrence"], ["is_present"]), # TODO: write test file # (["procedure_occurrence"], ["is_present"]), # TODO: write test file # (["device_exposure"], ["is_present"]), # TODO: write test file @@ -212,15 +271,12 @@ def test_setup_variables( "enrich_var_with_feature_info", [False, True], ) -@pytest.mark.parametrize( - "keep_date", - ["start", 
"end", "interval"], -) -def test_setup_interval_variables( +def test_setup_interval_type_variables( omop_connection_vanilla, observation_table, data_tables, data_field_to_keep, + target_r, enrich_var_with_feature_info, keep_date, ): @@ -245,6 +301,8 @@ def test_setup_interval_variables( assert edata.r.shape[2] == num_intervals assert edata.var.shape[1] == VAR_DIM_BASE + (VAR_DIM_FEATURE_INFO if enrich_var_with_feature_info else 0) + assert np.allclose(edata.r, np.array(target_r), equal_nan=True) + @pytest.mark.parametrize( "edata, backend_handle, data_tables, data_field_to_keep, interval_length_number, interval_length_unit, num_intervals, enrich_var_with_feature_info, enrich_var_with_unit_info, expected_error", From 056340e0d7ce1fdd7320c0b36a31116e05b85ad1 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Thu, 21 Nov 2024 20:44:06 +0100 Subject: [PATCH 36/43] support condition_occurrence --- src/ehrdata/io/omop/_check_arguments.py | 2 +- src/ehrdata/io/omop/_queries.py | 17 +++-- .../toy_omop/vanilla/condition_occurrence.csv | 10 +++ tests/test_io/test_omop.py | 62 ++++++++++++++++++- 4 files changed, 85 insertions(+), 6 deletions(-) create mode 100644 tests/data/toy_omop/vanilla/condition_occurrence.csv diff --git a/src/ehrdata/io/omop/_check_arguments.py b/src/ehrdata/io/omop/_check_arguments.py index 8b145cf..cc89cae 100644 --- a/src/ehrdata/io/omop/_check_arguments.py +++ b/src/ehrdata/io/omop/_check_arguments.py @@ -12,7 +12,7 @@ VALID_OBSERVATION_TABLES_SINGLE = ["person"] VALID_OBSERVATION_TABLES_JOIN = ["person_cohort", "person_observation_period", "person_visit_occurrence"] VALID_VARIABLE_TABLES = ["measurement", "observation", "specimen"] -VALID_INTERVAL_VARIABLE_TABLES = ["drug_exposure"] +VALID_INTERVAL_VARIABLE_TABLES = ["drug_exposure", "condition_occurrence"] VALID_KEEP_DATES = ["start", "end", "interval"] diff --git a/src/ehrdata/io/omop/_queries.py b/src/ehrdata/io/omop/_queries.py index 080c8e3..99bf9c0 100644 --- a/src/ehrdata/io/omop/_queries.py +++ b/src/ehrdata/io/omop/_queries.py @@ -24,7 +24,16 @@ "observation": "observation", "specimen": "specimen", "drug_exposure": "drug", + "condition_occurrence": "condition", } +DATA_TABLE_DATE_TRUNK = { + "measurement": "measurement", + "observation": "observation", + "specimen": "specimen", + "drug_exposure": "drug_exposure", + "condition_occurrence": "condition", +} + AGGREGATION_STRATEGY_KEY = { "last": "LAST", @@ -147,7 +156,7 @@ def time_interval_table_query_long_format( ) \ SELECT lfi.person_id, lfi.data_table_concept_id, interval_step, interval_start, interval_end, {_generate_value_query("data_table_with_presence_indicator", data_field_to_keep, AGGREGATION_STRATEGY_KEY[aggregation_strategy])} \ FROM long_format_intervals as lfi \ - LEFT JOIN data_table_with_presence_indicator ON lfi.person_id = data_table_with_presence_indicator.person_id AND lfi.data_table_concept_id = data_table_with_presence_indicator.{DATA_TABLE_CONCEPT_ID_TRUNK[data_table]}_concept_id AND data_table_with_presence_indicator.{data_table}_{date_prefix}date BETWEEN lfi.interval_start AND lfi.interval_end \ + LEFT JOIN data_table_with_presence_indicator ON lfi.person_id = data_table_with_presence_indicator.person_id AND lfi.data_table_concept_id = data_table_with_presence_indicator.{DATA_TABLE_CONCEPT_ID_TRUNK[data_table]}_concept_id AND data_table_with_presence_indicator.{DATA_TABLE_DATE_TRUNK[data_table]}_{date_prefix}date BETWEEN lfi.interval_start AND lfi.interval_end \ GROUP BY lfi.person_id, lfi.data_table_concept_id, interval_step, 
interval_start, interval_end """ ).df() @@ -222,9 +231,9 @@ def time_interval_table_for_interval_tables_query_long_format( FROM long_format_intervals as lfi \ LEFT JOIN data_table_with_presence_indicator ON lfi.person_id = data_table_with_presence_indicator.person_id \ AND lfi.data_table_concept_id = data_table_with_presence_indicator.{DATA_TABLE_CONCEPT_ID_TRUNK[data_table]}_concept_id \ - AND (data_table_with_presence_indicator.{data_table}_start_date BETWEEN lfi.interval_start AND lfi.interval_end \ - OR data_table_with_presence_indicator.{data_table}_end_date BETWEEN lfi.interval_start AND lfi.interval_end \ - OR (data_table_with_presence_indicator.{data_table}_start_date < lfi.interval_start AND data_table_with_presence_indicator.{data_table}_end_date > lfi.interval_end)) \ + AND (data_table_with_presence_indicator.{DATA_TABLE_DATE_TRUNK[data_table]}_start_date BETWEEN lfi.interval_start AND lfi.interval_end \ + OR data_table_with_presence_indicator.{DATA_TABLE_DATE_TRUNK[data_table]}_end_date BETWEEN lfi.interval_start AND lfi.interval_end \ + OR (data_table_with_presence_indicator.{DATA_TABLE_DATE_TRUNK[data_table]}_start_date < lfi.interval_start AND data_table_with_presence_indicator.{DATA_TABLE_DATE_TRUNK[data_table]}_end_date > lfi.interval_end)) \ GROUP BY lfi.person_id, lfi.data_table_concept_id, interval_step, interval_start, interval_end """ ).df() diff --git a/tests/data/toy_omop/vanilla/condition_occurrence.csv b/tests/data/toy_omop/vanilla/condition_occurrence.csv new file mode 100644 index 0000000..0efb7e0 --- /dev/null +++ b/tests/data/toy_omop/vanilla/condition_occurrence.csv @@ -0,0 +1,10 @@ +condition_occurrence_id,person_id,condition_concept_id,condition_start_date,condition_start_datetime,condition_end_date,condition_end_datetime,condition_type_concept_id,condition_status_concept_id,stop_reason,provider_id,visit_occurrence_id,visit_detail_id,condition_source_value,condition_source_concept_id,condition_status_source_value +1,1,43530622,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,38000175,0,,0,31,,10,1121000119107, +2,1,43530622,2100-02-01,2100-02-01 12:00:00,2100-02-28,2100-02-28 00:00:00,38000175,0,,0,28,,10,1121000119107, +3,1,4112343,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,38000175,0,,0,31,,15,4112343, +4,2,43530622,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,38000175,0,,0,31,,10,1121000119107, +5,2,43530622,2100-02-01,2100-02-01 12:00:00,2100-02-28,2100-02-28 00:00:00,38000175,0,,0,28,,10,1121000119107, +6,2,4112343,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,38000175,0,,0,31,,15,4112343, +7,3,43530622,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,38000175,0,,0,31,,10,1121000119107, +8,3,43530622,2100-02-01,2100-02-01 12:00:00,2100-02-28,2100-02-28 00:00:00,38000175,0,,0,28,,10,1121000119107, +9,3,4112343,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,38000175,0,,0,31,,15,4112343, diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py index 1ef28c9..6bc101d 100644 --- a/tests/test_io/test_omop.py +++ b/tests/test_io/test_omop.py @@ -16,6 +16,7 @@ "observation": 2, "specimen": 2, "drug_exposure": 2, + "condition_occurrence": 2, } # constants for setup_variables @@ -261,7 +262,66 @@ def test_setup_variables( [[1, 1, 1, 1], [1, 1, 1, 1]], ], ), - # (["condition_occurrence"], ["is_present"]), # TODO: write test file + ( + ["condition_occurrence"], + ["condition_source_value"], + "start", + [ + [[15, np.nan, np.nan, np.nan], [10, np.nan, np.nan, 
np.nan]], + [[15, np.nan, np.nan, np.nan], [10, np.nan, np.nan, np.nan]], + [[15, np.nan, np.nan, np.nan], [10, np.nan, np.nan, np.nan]], + ], + ), + ( + ["condition_occurrence"], + ["condition_source_value"], + "end", + [ + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + ], + ), + ( + ["condition_occurrence"], + ["condition_source_value"], + "interval", + [ + [[15, 15, 15, 15], [10, 10, 10, 10]], + [[15, 15, 15, 15], [10, 10, 10, 10]], + [[15, 15, 15, 15], [10, 10, 10, 10]], + ], + ), + ( + ["condition_occurrence"], + ["is_present"], + "start", + [ + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + ], + ), + ( + ["condition_occurrence"], + ["is_present"], + "end", + [ + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + ], + ), + ( + ["condition_occurrence"], + ["is_present"], + "interval", + [ + [[1, 1, 1, 1], [1, 1, 1, 1]], + [[1, 1, 1, 1], [1, 1, 1, 1]], + [[1, 1, 1, 1], [1, 1, 1, 1]], + ], + ), # (["procedure_occurrence"], ["is_present"]), # TODO: write test file # (["device_exposure"], ["is_present"]), # TODO: write test file # (["note"], ["is_present"]), From 85b0c1750e44deff2435d27065220c9fcfc86232 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Fri, 22 Nov 2024 12:04:25 +0100 Subject: [PATCH 37/43] support procedure_occurrence --- src/ehrdata/io/omop/_check_arguments.py | 2 +- src/ehrdata/io/omop/_queries.py | 81 +++++++++++++------ .../toy_omop/vanilla/procedure_occurrence.csv | 10 +++ tests/test_io/test_omop.py | 62 +++++++++++++- 4 files changed, 128 insertions(+), 27 deletions(-) create mode 100644 tests/data/toy_omop/vanilla/procedure_occurrence.csv diff --git a/src/ehrdata/io/omop/_check_arguments.py b/src/ehrdata/io/omop/_check_arguments.py index cc89cae..fcd2d78 100644 --- a/src/ehrdata/io/omop/_check_arguments.py +++ b/src/ehrdata/io/omop/_check_arguments.py @@ -12,7 +12,7 @@ VALID_OBSERVATION_TABLES_SINGLE = ["person"] VALID_OBSERVATION_TABLES_JOIN = ["person_cohort", "person_observation_period", "person_visit_occurrence"] VALID_VARIABLE_TABLES = ["measurement", "observation", "specimen"] -VALID_INTERVAL_VARIABLE_TABLES = ["drug_exposure", "condition_occurrence"] +VALID_INTERVAL_VARIABLE_TABLES = ["drug_exposure", "condition_occurrence", "procedure_occurrence"] VALID_KEEP_DATES = ["start", "end", "interval"] diff --git a/src/ehrdata/io/omop/_queries.py b/src/ehrdata/io/omop/_queries.py index 99bf9c0..87eab08 100644 --- a/src/ehrdata/io/omop/_queries.py +++ b/src/ehrdata/io/omop/_queries.py @@ -3,16 +3,22 @@ import duckdb import pandas as pd -START_DATE_KEY = { - "visit_occurrence": "visit_start_date", - "observation_period": "observation_period_start_date", - "cohort": "cohort_start_date", -} -END_DATE_KEY = { - "visit_occurrence": "visit_end_date", - "observation_period": "observation_period_end_date", - "cohort": "cohort_end_date", -} +# START_DATE_KEY = { +# "visit_occurrence": "visit_start_date", +# "observation_period": "observation_period_start_date", +# "cohort": "cohort_start_date", +# "drug_exposure": "drug_exposure_start_date", +# "condition_occurrence": "condition_start_date", +# 
"procedure_occurrence": "procedure_date", # v5.4, as v5.3 had no end! TODO: allow v5.3 too +# } +# END_DATE_KEY = { +# "visit_occurrence": "visit_end_date", +# "observation_period": "observation_period_end_date", +# "cohort": "cohort_end_date", +# "drug_exposure": "drug_exposure_end_date", +# "condition_occurrence": "condition_end_date", +# "procedure_occurrence": "procedure_end_date", # v5.4, as v5.3 had no end! TODO: allow v5.3 too +# } TIME_DEFINING_TABLE_SUBJECT_KEY = { "visit_occurrence": "person_id", "observation_period": "person_id", @@ -25,15 +31,40 @@ "specimen": "specimen", "drug_exposure": "drug", "condition_occurrence": "condition", + "procedure_occurrence": "procedure", } -DATA_TABLE_DATE_TRUNK = { - "measurement": "measurement", - "observation": "observation", - "specimen": "specimen", - "drug_exposure": "drug_exposure", - "condition_occurrence": "condition", -} +# DATA_TABLE_DATE_TRUNK = { +# "measurement": "measurement", +# "observation": "observation", +# "specimen": "specimen", +# "drug_exposure": "drug_exposure", +# "condition_occurrence": "condition", +# "procedure_occurrence": "procedure", +# } +DATA_TABLE_DATE_KEYS = { + "timepoint": { + "measurement": "measurement_date", + "observation": "observation_date", + "specimen": "specimen_date", + }, + "start": { + "visit_occurrence": "visit_start_date", + "observation_period": "observation_period_start_date", + "cohort": "cohort_start_date", + "drug_exposure": "drug_exposure_start_date", + "condition_occurrence": "condition_start_date", + "procedure_occurrence": "procedure_date", # in v5.3, procedure didnt have end date + }, + "end": { + "visit_occurrence": "visit_end_date", + "observation_period": "observation_period_end_date", + "cohort": "cohort_end_date", + "drug_exposure": "drug_exposure_end_date", + "condition_occurrence": "condition_end_date", + "procedure_occurrence": "procedure_end_date", # in v5.3, procedure didnt have end date TODO v5.3 support + }, +} AGGREGATION_STRATEGY_KEY = { "last": "LAST", @@ -108,8 +139,8 @@ def time_interval_table_query_long_format( if isinstance(data_field_to_keep, str): data_field_to_keep = [data_field_to_keep] - if date_prefix != "": - date_prefix = date_prefix + "_" + if date_prefix == "": + date_prefix = "timepoint" timedeltas_dataframe = _generate_timedeltas(interval_length_number, interval_length_unit, num_intervals) @@ -127,7 +158,7 @@ def time_interval_table_query_long_format( df = backend_handle.execute( f""" WITH person_time_defining_table AS ( \ - SELECT person.person_id as person_id, {START_DATE_KEY[time_defining_table]} as start_date, {END_DATE_KEY[time_defining_table]} as end_date \ + SELECT person.person_id as person_id, {DATA_TABLE_DATE_KEYS["start"][time_defining_table]} as start_date, {DATA_TABLE_DATE_KEYS["end"][time_defining_table]} as end_date \ FROM person \ JOIN {time_defining_table} ON person.person_id = {time_defining_table}.{TIME_DEFINING_TABLE_SUBJECT_KEY[time_defining_table]} \ ), \ @@ -156,7 +187,7 @@ def time_interval_table_query_long_format( ) \ SELECT lfi.person_id, lfi.data_table_concept_id, interval_step, interval_start, interval_end, {_generate_value_query("data_table_with_presence_indicator", data_field_to_keep, AGGREGATION_STRATEGY_KEY[aggregation_strategy])} \ FROM long_format_intervals as lfi \ - LEFT JOIN data_table_with_presence_indicator ON lfi.person_id = data_table_with_presence_indicator.person_id AND lfi.data_table_concept_id = data_table_with_presence_indicator.{DATA_TABLE_CONCEPT_ID_TRUNK[data_table]}_concept_id AND 
data_table_with_presence_indicator.{DATA_TABLE_DATE_TRUNK[data_table]}_{date_prefix}date BETWEEN lfi.interval_start AND lfi.interval_end \ + LEFT JOIN data_table_with_presence_indicator ON lfi.person_id = data_table_with_presence_indicator.person_id AND lfi.data_table_concept_id = data_table_with_presence_indicator.{DATA_TABLE_CONCEPT_ID_TRUNK[data_table]}_concept_id AND data_table_with_presence_indicator.{DATA_TABLE_DATE_KEYS[date_prefix][data_table]} BETWEEN lfi.interval_start AND lfi.interval_end \ GROUP BY lfi.person_id, lfi.data_table_concept_id, interval_step, interval_start, interval_end """ ).df() @@ -200,7 +231,7 @@ def time_interval_table_for_interval_tables_query_long_format( df = backend_handle.execute( f""" WITH person_time_defining_table AS ( \ - SELECT person.person_id as person_id, {START_DATE_KEY[time_defining_table]} as start_date, {END_DATE_KEY[time_defining_table]} as end_date \ + SELECT person.person_id as person_id, {DATA_TABLE_DATE_KEYS["start"][time_defining_table]} as start_date, {DATA_TABLE_DATE_KEYS["end"][time_defining_table]} as end_date \ FROM person \ JOIN {time_defining_table} ON person.person_id = {time_defining_table}.{TIME_DEFINING_TABLE_SUBJECT_KEY[time_defining_table]} \ ), \ @@ -231,9 +262,9 @@ def time_interval_table_for_interval_tables_query_long_format( FROM long_format_intervals as lfi \ LEFT JOIN data_table_with_presence_indicator ON lfi.person_id = data_table_with_presence_indicator.person_id \ AND lfi.data_table_concept_id = data_table_with_presence_indicator.{DATA_TABLE_CONCEPT_ID_TRUNK[data_table]}_concept_id \ - AND (data_table_with_presence_indicator.{DATA_TABLE_DATE_TRUNK[data_table]}_start_date BETWEEN lfi.interval_start AND lfi.interval_end \ - OR data_table_with_presence_indicator.{DATA_TABLE_DATE_TRUNK[data_table]}_end_date BETWEEN lfi.interval_start AND lfi.interval_end \ - OR (data_table_with_presence_indicator.{DATA_TABLE_DATE_TRUNK[data_table]}_start_date < lfi.interval_start AND data_table_with_presence_indicator.{DATA_TABLE_DATE_TRUNK[data_table]}_end_date > lfi.interval_end)) \ + AND (data_table_with_presence_indicator.{DATA_TABLE_DATE_KEYS["start"][data_table]} BETWEEN lfi.interval_start AND lfi.interval_end \ + OR data_table_with_presence_indicator.{DATA_TABLE_DATE_KEYS["end"][data_table]} BETWEEN lfi.interval_start AND lfi.interval_end \ + OR (data_table_with_presence_indicator.{DATA_TABLE_DATE_KEYS["start"][data_table]} < lfi.interval_start AND data_table_with_presence_indicator.{DATA_TABLE_DATE_KEYS["end"][data_table]} > lfi.interval_end)) \ GROUP BY lfi.person_id, lfi.data_table_concept_id, interval_step, interval_start, interval_end """ ).df() diff --git a/tests/data/toy_omop/vanilla/procedure_occurrence.csv b/tests/data/toy_omop/vanilla/procedure_occurrence.csv new file mode 100644 index 0000000..90fe7b1 --- /dev/null +++ b/tests/data/toy_omop/vanilla/procedure_occurrence.csv @@ -0,0 +1,10 @@ +procedure_occurrence_id,person_id,procedure_concept_id,procedure_date,procedure_datetime,procedure_end_date,procedure_end_datetime,procedure_type_concept_id,modifier_concept_id,quantity,provider_id,visit_occurrence_id,visit_detail_id,procedure_source_value,procedure_source_concept_id,modifier_source_value +1,1,4326177,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,38000267,0,,0,31,,430193006,1121000119107, +2,1,4326177,2100-02-01,2100-02-01 12:00:00,2100-02-28,2100-02-28 00:00:00,38000267,0,,0,28,,430193006,1121000119107, +3,1,4107731,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 
00:00:00,38000267,0,,0,31,,180256009,4107731, +4,2,4326177,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,38000267,0,,0,31,,430193006,1121000119107, +5,2,4326177,2100-02-01,2100-02-01 12:00:00,2100-02-28,2100-02-28 00:00:00,38000267,0,,0,28,,430193006,1121000119107, +6,2,4107731,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,38000267,0,,0,31,,180256009,4107731, +7,3,4326177,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,38000267,0,,0,31,,430193006,1121000119107, +8,3,4326177,2100-02-01,2100-02-01 12:00:00,2100-02-28,2100-02-28 00:00:00,38000267,0,,0,28,,430193006,1121000119107, +9,3,4107731,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,38000267,0,,0,31,,180256009,4107731, diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py index 6bc101d..e8d503c 100644 --- a/tests/test_io/test_omop.py +++ b/tests/test_io/test_omop.py @@ -17,6 +17,7 @@ "specimen": 2, "drug_exposure": 2, "condition_occurrence": 2, + "procedure_occurrence": 2, } # constants for setup_variables @@ -322,7 +323,66 @@ def test_setup_variables( [[1, 1, 1, 1], [1, 1, 1, 1]], ], ), - # (["procedure_occurrence"], ["is_present"]), # TODO: write test file + ( + ["procedure_occurrence"], + ["procedure_source_value"], + "start", + [ + [[180256009, np.nan, np.nan, np.nan], [430193006, np.nan, np.nan, np.nan]], + [[180256009, np.nan, np.nan, np.nan], [430193006, np.nan, np.nan, np.nan]], + [[180256009, np.nan, np.nan, np.nan], [430193006, np.nan, np.nan, np.nan]], + ], + ), + ( + ["procedure_occurrence"], + ["procedure_source_value"], + "end", + [ + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + ], + ), + ( + ["procedure_occurrence"], + ["procedure_source_value"], + "interval", + [ + [[180256009, 180256009, 180256009, 180256009], [430193006, 430193006, 430193006, 430193006]], + [[180256009, 180256009, 180256009, 180256009], [430193006, 430193006, 430193006, 430193006]], + [[180256009, 180256009, 180256009, 180256009], [430193006, 430193006, 430193006, 430193006]], + ], + ), + ( + ["procedure_occurrence"], + ["is_present"], + "start", + [ + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + ], + ), + ( + ["procedure_occurrence"], + ["is_present"], + "end", + [ + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + ], + ), + ( + ["procedure_occurrence"], + ["is_present"], + "interval", + [ + [[1, 1, 1, 1], [1, 1, 1, 1]], + [[1, 1, 1, 1], [1, 1, 1, 1]], + [[1, 1, 1, 1], [1, 1, 1, 1]], + ], + ), # (["device_exposure"], ["is_present"]), # TODO: write test file # (["note"], ["is_present"]), ], From 6b0e57ebe5c5b5cadd08f5108bdc816257f70cbe Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Fri, 22 Nov 2024 14:05:47 +0100 Subject: [PATCH 38/43] support device_exposure --- src/ehrdata/io/omop/_check_arguments.py | 7 ++- src/ehrdata/io/omop/_queries.py | 3 + .../data/toy_omop/vanilla/device_exposure.csv | 10 +++ tests/test_io/test_omop.py | 62 ++++++++++++++++++- 4 files changed, 80 insertions(+), 2 deletions(-) create mode 100644 tests/data/toy_omop/vanilla/device_exposure.csv diff --git 
a/src/ehrdata/io/omop/_check_arguments.py b/src/ehrdata/io/omop/_check_arguments.py index fcd2d78..5da1737 100644 --- a/src/ehrdata/io/omop/_check_arguments.py +++ b/src/ehrdata/io/omop/_check_arguments.py @@ -12,7 +12,12 @@ VALID_OBSERVATION_TABLES_SINGLE = ["person"] VALID_OBSERVATION_TABLES_JOIN = ["person_cohort", "person_observation_period", "person_visit_occurrence"] VALID_VARIABLE_TABLES = ["measurement", "observation", "specimen"] -VALID_INTERVAL_VARIABLE_TABLES = ["drug_exposure", "condition_occurrence", "procedure_occurrence"] +VALID_INTERVAL_VARIABLE_TABLES = [ + "drug_exposure", + "condition_occurrence", + "procedure_occurrence", + "device_exposure", +] VALID_KEEP_DATES = ["start", "end", "interval"] diff --git a/src/ehrdata/io/omop/_queries.py b/src/ehrdata/io/omop/_queries.py index 87eab08..3e15f47 100644 --- a/src/ehrdata/io/omop/_queries.py +++ b/src/ehrdata/io/omop/_queries.py @@ -32,6 +32,7 @@ "drug_exposure": "drug", "condition_occurrence": "condition", "procedure_occurrence": "procedure", + "device_exposure": "device", } # DATA_TABLE_DATE_TRUNK = { # "measurement": "measurement", @@ -55,6 +56,7 @@ "drug_exposure": "drug_exposure_start_date", "condition_occurrence": "condition_start_date", "procedure_occurrence": "procedure_date", # in v5.3, procedure didnt have end date + "device_exposure": "device_exposure_start_date", }, "end": { "visit_occurrence": "visit_end_date", @@ -63,6 +65,7 @@ "drug_exposure": "drug_exposure_end_date", "condition_occurrence": "condition_end_date", "procedure_occurrence": "procedure_end_date", # in v5.3, procedure didnt have end date TODO v5.3 support + "device_exposure": "device_exposure_end_date", }, } diff --git a/tests/data/toy_omop/vanilla/device_exposure.csv b/tests/data/toy_omop/vanilla/device_exposure.csv new file mode 100644 index 0000000..d84c862 --- /dev/null +++ b/tests/data/toy_omop/vanilla/device_exposure.csv @@ -0,0 +1,10 @@ +device_exposure_id,person_id,device_concept_id,device_exposure_start_date,device_exposure_start_datetime,device_exposure_end_date,device_exposure_end_datetime,device_type_concept_id,unique_device_id,production_id,quantity,provider_id,visit_occurrence_id,visit_detail_id,device_source_value,device_source_concept_id,unit_concept_id,unit_source_value,unit_source_concept_id +1,1,4217646,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,38000267,,,,,,,72506001,4217646,,, +2,1,4217646,2100-02-01,2100-02-01 12:00:00,2100-02-28,2100-02-28 00:00:00,38000267,,,,,,,72506001,4217646,,, +3,1,45768171,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,32817,,,,,,,224087,2000030021,,, +4,2,4217646,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,38000267,,,,,,,72506001,4217646,,, +5,2,4217646,2100-02-01,2100-02-01 12:00:00,2100-02-28,2100-02-28 00:00:00,38000267,,,,,,,72506001,4217646,,, +6,2,45768171,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,32817,,,,,,,224087,2000030021,,, +7,3,4217646,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,38000267,,,,,,,72506001,4217646,,, +8,3,4217646,2100-02-01,2100-02-01 12:00:00,2100-02-28,2100-02-28 00:00:00,38000267,,,,,,,72506001,4217646,,, +9,3,45768171,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,32817,,,,,,,224087,2000030021,,, diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py index e8d503c..911e967 100644 --- a/tests/test_io/test_omop.py +++ b/tests/test_io/test_omop.py @@ -18,6 +18,7 @@ "drug_exposure": 2, "condition_occurrence": 2, "procedure_occurrence": 2, + 
"device_exposure": 2, } # constants for setup_variables @@ -383,7 +384,66 @@ def test_setup_variables( [[1, 1, 1, 1], [1, 1, 1, 1]], ], ), - # (["device_exposure"], ["is_present"]), # TODO: write test file + ( + ["device_exposure"], + ["device_source_value"], + "start", + [ + [[72506001, np.nan, np.nan, np.nan], [224087, np.nan, np.nan, np.nan]], + [[72506001, np.nan, np.nan, np.nan], [224087, np.nan, np.nan, np.nan]], + [[72506001, np.nan, np.nan, np.nan], [224087, np.nan, np.nan, np.nan]], + ], + ), + ( + ["device_exposure"], + ["device_source_value"], + "end", + [ + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + ], + ), + ( + ["device_exposure"], + ["device_source_value"], + "interval", + [ + [[72506001, 72506001, 72506001, 72506001], [224087, 224087, 224087, 224087]], + [[72506001, 72506001, 72506001, 72506001], [224087, 224087, 224087, 224087]], + [[72506001, 72506001, 72506001, 72506001], [224087, 224087, 224087, 224087]], + ], + ), + ( + ["device_exposure"], + ["is_present"], + "start", + [ + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + ], + ), + ( + ["device_exposure"], + ["is_present"], + "end", + [ + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + ], + ), + ( + ["device_exposure"], + ["is_present"], + "interval", + [ + [[1, 1, 1, 1], [1, 1, 1, 1]], + [[1, 1, 1, 1], [1, 1, 1, 1]], + [[1, 1, 1, 1], [1, 1, 1, 1]], + ], + ), # (["note"], ["is_present"]), ], ) From 9dc6637fff6eeaafc0f98831195c73192338addd Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Fri, 22 Nov 2024 14:36:37 +0100 Subject: [PATCH 39/43] support drug_era --- src/ehrdata/io/omop/_check_arguments.py | 1 + src/ehrdata/io/omop/_queries.py | 3 ++ tests/data/toy_omop/vanilla/drug_era.csv | 10 ++++ tests/test_io/test_omop.py | 62 +++++++++++++++++++++++- 4 files changed, 75 insertions(+), 1 deletion(-) create mode 100644 tests/data/toy_omop/vanilla/drug_era.csv diff --git a/src/ehrdata/io/omop/_check_arguments.py b/src/ehrdata/io/omop/_check_arguments.py index 5da1737..b53e79f 100644 --- a/src/ehrdata/io/omop/_check_arguments.py +++ b/src/ehrdata/io/omop/_check_arguments.py @@ -17,6 +17,7 @@ "condition_occurrence", "procedure_occurrence", "device_exposure", + "drug_era", ] VALID_KEEP_DATES = ["start", "end", "interval"] diff --git a/src/ehrdata/io/omop/_queries.py b/src/ehrdata/io/omop/_queries.py index 3e15f47..4684633 100644 --- a/src/ehrdata/io/omop/_queries.py +++ b/src/ehrdata/io/omop/_queries.py @@ -33,6 +33,7 @@ "condition_occurrence": "condition", "procedure_occurrence": "procedure", "device_exposure": "device", + "drug_era": "drug", } # DATA_TABLE_DATE_TRUNK = { # "measurement": "measurement", @@ -57,6 +58,7 @@ "condition_occurrence": "condition_start_date", "procedure_occurrence": "procedure_date", # in v5.3, procedure didnt have end date "device_exposure": "device_exposure_start_date", + "drug_era": "drug_era_start_date", }, "end": { "visit_occurrence": "visit_end_date", @@ -66,6 +68,7 @@ "condition_occurrence": "condition_end_date", "procedure_occurrence": "procedure_end_date", # in v5.3, procedure didnt have end date TODO v5.3 support 
"device_exposure": "device_exposure_end_date", + "drug_era": "drug_era_end_date", }, } diff --git a/tests/data/toy_omop/vanilla/drug_era.csv b/tests/data/toy_omop/vanilla/drug_era.csv new file mode 100644 index 0000000..a2dbf2c --- /dev/null +++ b/tests/data/toy_omop/vanilla/drug_era.csv @@ -0,0 +1,10 @@ +drug_era_id,person_id,drug_concept_id,drug_era_start_date,drug_era_end_date,drug_exposure_count,gap_days +1,1,1124957,2100-01-01,2100-01-31,2,1 +2,1,1124957,2100-02-01,2100-02-28,2,1 +3,1,1368671,2100-01-01,2100-01-31,4,3 +4,2,1124957,2100-01-01,2100-01-31,2,1 +5,2,1124957,2100-02-01,2100-02-28,2,1 +6,2,1368671,2100-01-01,2100-01-31,4,3 +7,3,1124957,2100-01-01,2100-01-31,2,2 +8,3,1124957,2100-02-01,2100-02-28,2,1 +9,3,1368671,2100-01-01,2100-01-31,4,3 diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py index 911e967..f8ea0af 100644 --- a/tests/test_io/test_omop.py +++ b/tests/test_io/test_omop.py @@ -19,6 +19,7 @@ "condition_occurrence": 2, "procedure_occurrence": 2, "device_exposure": 2, + "drug_era": 2, } # constants for setup_variables @@ -444,7 +445,66 @@ def test_setup_variables( [[1, 1, 1, 1], [1, 1, 1, 1]], ], ), - # (["note"], ["is_present"]), + ( + ["drug_era"], + ["drug_exposure_count"], + "start", + [ + [[2, np.nan, np.nan, np.nan], [4, np.nan, np.nan, np.nan]], + [[2, np.nan, np.nan, np.nan], [4, np.nan, np.nan, np.nan]], + [[2, np.nan, np.nan, np.nan], [4, np.nan, np.nan, np.nan]], + ], + ), + ( + ["drug_era"], + ["drug_exposure_count"], + "end", + [ + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + ], + ), + ( + ["drug_era"], + ["drug_exposure_count"], + "interval", + [ + [[2, 2, 2, 2], [4, 4, 4, 4]], + [[2, 2, 2, 2], [4, 4, 4, 4]], + [[2, 2, 2, 2], [4, 4, 4, 4]], + ], + ), + ( + ["drug_era"], + ["is_present"], + "start", + [ + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + ], + ), + ( + ["drug_era"], + ["is_present"], + "end", + [ + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + ], + ), + ( + ["drug_era"], + ["is_present"], + "interval", + [ + [[1, 1, 1, 1], [1, 1, 1, 1]], + [[1, 1, 1, 1], [1, 1, 1, 1]], + [[1, 1, 1, 1], [1, 1, 1, 1]], + ], + ), ], ) @pytest.mark.parametrize( From 0525855ba8de02acdb32a58e97730269c571af46 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Fri, 22 Nov 2024 15:06:27 +0100 Subject: [PATCH 40/43] support dose_era --- src/ehrdata/io/omop/_check_arguments.py | 1 + src/ehrdata/io/omop/_queries.py | 27 ++--------- tests/data/toy_omop/vanilla/dose_era.csv | 10 ++++ tests/test_io/test_omop.py | 61 ++++++++++++++++++++++++ 4 files changed, 75 insertions(+), 24 deletions(-) create mode 100644 tests/data/toy_omop/vanilla/dose_era.csv diff --git a/src/ehrdata/io/omop/_check_arguments.py b/src/ehrdata/io/omop/_check_arguments.py index b53e79f..ad637d6 100644 --- a/src/ehrdata/io/omop/_check_arguments.py +++ b/src/ehrdata/io/omop/_check_arguments.py @@ -18,6 +18,7 @@ "procedure_occurrence", "device_exposure", "drug_era", + "dose_era", ] VALID_KEEP_DATES = ["start", "end", "interval"] diff --git a/src/ehrdata/io/omop/_queries.py b/src/ehrdata/io/omop/_queries.py index 
4684633..3a4d002 100644 --- a/src/ehrdata/io/omop/_queries.py +++ b/src/ehrdata/io/omop/_queries.py @@ -3,22 +3,6 @@ import duckdb import pandas as pd -# START_DATE_KEY = { -# "visit_occurrence": "visit_start_date", -# "observation_period": "observation_period_start_date", -# "cohort": "cohort_start_date", -# "drug_exposure": "drug_exposure_start_date", -# "condition_occurrence": "condition_start_date", -# "procedure_occurrence": "procedure_date", # v5.4, as v5.3 had no end! TODO: allow v5.3 too -# } -# END_DATE_KEY = { -# "visit_occurrence": "visit_end_date", -# "observation_period": "observation_period_end_date", -# "cohort": "cohort_end_date", -# "drug_exposure": "drug_exposure_end_date", -# "condition_occurrence": "condition_end_date", -# "procedure_occurrence": "procedure_end_date", # v5.4, as v5.3 had no end! TODO: allow v5.3 too -# } TIME_DEFINING_TABLE_SUBJECT_KEY = { "visit_occurrence": "person_id", "observation_period": "person_id", @@ -34,15 +18,8 @@ "procedure_occurrence": "procedure", "device_exposure": "device", "drug_era": "drug", + "dose_era": "drug", } -# DATA_TABLE_DATE_TRUNK = { -# "measurement": "measurement", -# "observation": "observation", -# "specimen": "specimen", -# "drug_exposure": "drug_exposure", -# "condition_occurrence": "condition", -# "procedure_occurrence": "procedure", -# } DATA_TABLE_DATE_KEYS = { "timepoint": { @@ -59,6 +36,7 @@ "procedure_occurrence": "procedure_date", # in v5.3, procedure didnt have end date "device_exposure": "device_exposure_start_date", "drug_era": "drug_era_start_date", + "dose_era": "dose_era_start_date", }, "end": { "visit_occurrence": "visit_end_date", @@ -69,6 +47,7 @@ "procedure_occurrence": "procedure_end_date", # in v5.3, procedure didnt have end date TODO v5.3 support "device_exposure": "device_exposure_end_date", "drug_era": "drug_era_end_date", + "dose_era": "dose_era_end_date", }, } diff --git a/tests/data/toy_omop/vanilla/dose_era.csv b/tests/data/toy_omop/vanilla/dose_era.csv new file mode 100644 index 0000000..b2c1bf3 --- /dev/null +++ b/tests/data/toy_omop/vanilla/dose_era.csv @@ -0,0 +1,10 @@ +dose_era_id,person_id,drug_concept_id,unit_concept_id,dose_value,dose_era_start_date,dose_era_end_date +1,1,902427,8576,10,2100-01-01,2100-01-31 +2,1,902427,8576,10,2100-02-01,2100-02-28 +3,1,714785,8576,2.5,2100-01-01,2100-01-31 +4,2,902427,8576,10,2100-01-01,2100-01-31 +5,2,902427,8576,10,2100-02-01,2100-02-28 +6,2,714785,8576,2.5,2100-01-01,2100-01-31 +7,3,902427,8576,10,2100-01-01,2100-01-31 +8,3,902427,8576,10,2100-02-01,2100-02-28 +9,3,714785,8576,2.5,2100-01-01,2100-01-31 diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py index f8ea0af..2a2c205 100644 --- a/tests/test_io/test_omop.py +++ b/tests/test_io/test_omop.py @@ -20,6 +20,7 @@ "procedure_occurrence": 2, "device_exposure": 2, "drug_era": 2, + "dose_era": 2, } # constants for setup_variables @@ -505,6 +506,66 @@ def test_setup_variables( [[1, 1, 1, 1], [1, 1, 1, 1]], ], ), + ( + ["dose_era"], + ["dose_value"], + "start", + [ + [[2.5, np.nan, np.nan, np.nan], [10, np.nan, np.nan, np.nan]], + [[2.5, np.nan, np.nan, np.nan], [10, np.nan, np.nan, np.nan]], + [[2.5, np.nan, np.nan, np.nan], [10, np.nan, np.nan, np.nan]], + ], + ), + ( + ["dose_era"], + ["dose_value"], + "end", + [ + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + ], + ), + ( + ["dose_era"], + ["dose_value"], + 
"interval", + [ + [[2.5, 2.5, 2.5, 2.5], [10, 10, 10, 10]], + [[2.5, 2.5, 2.5, 2.5], [10, 10, 10, 10]], + [[2.5, 2.5, 2.5, 2.5], [10, 10, 10, 10]], + ], + ), + ( + ["dose_era"], + ["is_present"], + "start", + [ + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + ], + ), + ( + ["dose_era"], + ["is_present"], + "end", + [ + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + ], + ), + ( + ["dose_era"], + ["is_present"], + "interval", + [ + [[1, 1, 1, 1], [1, 1, 1, 1]], + [[1, 1, 1, 1], [1, 1, 1, 1]], + [[1, 1, 1, 1], [1, 1, 1, 1]], + ], + ), ], ) @pytest.mark.parametrize( From e4ad1dbb6517dd3a02fd5b306ec88a9a115e44df Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Fri, 22 Nov 2024 15:23:20 +0100 Subject: [PATCH 41/43] support condition_era --- src/ehrdata/io/omop/_check_arguments.py | 1 + src/ehrdata/io/omop/_queries.py | 3 + tests/data/toy_omop/vanilla/condition_era.csv | 10 +++ tests/test_io/test_omop.py | 61 +++++++++++++++++++ 4 files changed, 75 insertions(+) create mode 100644 tests/data/toy_omop/vanilla/condition_era.csv diff --git a/src/ehrdata/io/omop/_check_arguments.py b/src/ehrdata/io/omop/_check_arguments.py index ad637d6..5eb9528 100644 --- a/src/ehrdata/io/omop/_check_arguments.py +++ b/src/ehrdata/io/omop/_check_arguments.py @@ -19,6 +19,7 @@ "device_exposure", "drug_era", "dose_era", + "condition_era", ] VALID_KEEP_DATES = ["start", "end", "interval"] diff --git a/src/ehrdata/io/omop/_queries.py b/src/ehrdata/io/omop/_queries.py index 3a4d002..0885726 100644 --- a/src/ehrdata/io/omop/_queries.py +++ b/src/ehrdata/io/omop/_queries.py @@ -19,6 +19,7 @@ "device_exposure": "device", "drug_era": "drug", "dose_era": "drug", + "condition_era": "condition", } DATA_TABLE_DATE_KEYS = { @@ -37,6 +38,7 @@ "device_exposure": "device_exposure_start_date", "drug_era": "drug_era_start_date", "dose_era": "dose_era_start_date", + "condition_era": "condition_era_start_date", }, "end": { "visit_occurrence": "visit_end_date", @@ -48,6 +50,7 @@ "device_exposure": "device_exposure_end_date", "drug_era": "drug_era_end_date", "dose_era": "dose_era_end_date", + "condition_era": "condition_era_end_date", }, } diff --git a/tests/data/toy_omop/vanilla/condition_era.csv b/tests/data/toy_omop/vanilla/condition_era.csv new file mode 100644 index 0000000..f5c2cf0 --- /dev/null +++ b/tests/data/toy_omop/vanilla/condition_era.csv @@ -0,0 +1,10 @@ +condition_era_id,person_id,condition_concept_id,condition_era_start_date,condition_era_end_date,condition_occurrence_count +1,1,4140598,2100-01-01,2100-01-31,256 +2,1,4140598,2100-02-01,2100-02-28,256 +3,1,434610,2100-01-01,2100-01-31,1 +4,2,4140598,2100-01-01,2100-01-31,256 +5,2,4140598,2100-02-01,2100-02-28,256 +6,2,434610,2100-01-01,2100-01-31,1 +7,3,4140598,2100-01-01,2100-01-31,256 +8,3,4140598,2100-02-01,2100-02-28,256 +9,3,434610,2100-01-01,2100-01-31,1 diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py index 2a2c205..f649180 100644 --- a/tests/test_io/test_omop.py +++ b/tests/test_io/test_omop.py @@ -21,6 +21,7 @@ "device_exposure": 2, "drug_era": 2, "dose_era": 2, + "condition_era": 2, } # constants for setup_variables @@ -566,6 +567,66 @@ def test_setup_variables( [[1, 1, 1, 1], [1, 1, 1, 1]], ], ), + ( + ["condition_era"], + 
["condition_occurrence_count"], + "start", + [ + [[1, np.nan, np.nan, np.nan], [256, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [256, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [256, np.nan, np.nan, np.nan]], + ], + ), + ( + ["condition_era"], + ["condition_occurrence_count"], + "end", + [ + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + ], + ), + ( + ["condition_era"], + ["condition_occurrence_count"], + "interval", + [ + [[1, 1, 1, 1], [256, 256, 256, 256]], + [[1, 1, 1, 1], [256, 256, 256, 256]], + [[1, 1, 1, 1], [256, 256, 256, 256]], + ], + ), + ( + ["condition_era"], + ["is_present"], + "start", + [ + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + ], + ), + ( + ["condition_era"], + ["is_present"], + "end", + [ + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + ], + ), + ( + ["condition_era"], + ["is_present"], + "interval", + [ + [[1, 1, 1, 1], [1, 1, 1, 1]], + [[1, 1, 1, 1], [1, 1, 1, 1]], + [[1, 1, 1, 1], [1, 1, 1, 1]], + ], + ), ], ) @pytest.mark.parametrize( From 623ebab2a0db365200d9cc93c4daaa4263f2ccc6 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Fri, 22 Nov 2024 16:13:21 +0100 Subject: [PATCH 42/43] support episode --- src/ehrdata/io/omop/_check_arguments.py | 1 + src/ehrdata/io/omop/_queries.py | 3 ++ src/ehrdata/utils/_omop_utils.py | 1 + tests/data/toy_omop/vanilla/episode.csv | 10 ++++ tests/test_dt/test_dt.py | 2 +- tests/test_io/test_omop.py | 61 +++++++++++++++++++++++++ 6 files changed, 77 insertions(+), 1 deletion(-) create mode 100644 tests/data/toy_omop/vanilla/episode.csv diff --git a/src/ehrdata/io/omop/_check_arguments.py b/src/ehrdata/io/omop/_check_arguments.py index 5eb9528..9a5930f 100644 --- a/src/ehrdata/io/omop/_check_arguments.py +++ b/src/ehrdata/io/omop/_check_arguments.py @@ -20,6 +20,7 @@ "drug_era", "dose_era", "condition_era", + "episode", ] VALID_KEEP_DATES = ["start", "end", "interval"] diff --git a/src/ehrdata/io/omop/_queries.py b/src/ehrdata/io/omop/_queries.py index 0885726..2cdaa34 100644 --- a/src/ehrdata/io/omop/_queries.py +++ b/src/ehrdata/io/omop/_queries.py @@ -20,6 +20,7 @@ "drug_era": "drug", "dose_era": "drug", "condition_era": "condition", + "episode": "episode", } DATA_TABLE_DATE_KEYS = { @@ -39,6 +40,7 @@ "drug_era": "drug_era_start_date", "dose_era": "dose_era_start_date", "condition_era": "condition_era_start_date", + "episode": "episode_start_date", }, "end": { "visit_occurrence": "visit_end_date", @@ -51,6 +53,7 @@ "drug_era": "drug_era_end_date", "dose_era": "dose_era_end_date", "condition_era": "condition_era_end_date", + "episode": "episode_end_date", }, } diff --git a/src/ehrdata/utils/_omop_utils.py b/src/ehrdata/utils/_omop_utils.py index 2b52d02..c2b1834 100644 --- a/src/ehrdata/utils/_omop_utils.py +++ b/src/ehrdata/utils/_omop_utils.py @@ -43,6 +43,7 @@ def get_table_catalog_dict(version: Literal["5.4"] = "5.4"): "note_nlp", "observation", "fact_relationship", + "episode", ] table_catalog_dict["Health system data"] = ["location", "care_site", "provider"] diff --git a/tests/data/toy_omop/vanilla/episode.csv 
b/tests/data/toy_omop/vanilla/episode.csv new file mode 100644 index 0000000..59ecf81 --- /dev/null +++ b/tests/data/toy_omop/vanilla/episode.csv @@ -0,0 +1,10 @@ +episode_id,person_id,episode_concept_id,episode_start_date,episode_start_datetime,episode_end_date,episode_end_datetime,episode_parent_id,episode_number,episode_object_concept_id,episode_type_concept_id,episode_source_value,episode_source_concept_id +1,1,32941,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,,,,,10, +2,1,32941,2100-02-01,2100-02-01 12:00:00,2100-02-28,2100-02-28 00:00:00,,,,,10, +3,1,32531,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,,,,,5, +4,2,32941,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,,,,,10, +5,2,32941,2100-02-01,2100-02-01 12:00:00,2100-02-28,2100-02-28 00:00:00,,,,,10, +6,2,32531,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,,,,,5, +7,3,32941,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,,,,,10, +8,3,32941,2100-02-01,2100-02-01 12:00:00,2100-02-28,2100-02-28 00:00:00,,,,,10, +9,3,32531,2100-01-01,2100-01-01 12:00:00,2100-01-31,2100-01-31 00:00:00,,,,,5, diff --git a/tests/test_dt/test_dt.py b/tests/test_dt/test_dt.py index 02fb030..65accff 100644 --- a/tests/test_dt/test_dt.py +++ b/tests/test_dt/test_dt.py @@ -34,7 +34,7 @@ def test_gibleed_omop(tmp_path): def test_synthea27nj_omop(tmp_path): duckdb_connection = duckdb.connect() ed.dt.synthea27nj_omop(data_path=tmp_path, backend_handle=duckdb_connection) - assert len(duckdb_connection.execute("SHOW TABLES").df()) == 37 + assert len(duckdb_connection.execute("SHOW TABLES").df()) == 38 # sanity check of one table assert duckdb_connection.execute("SELECT * FROM person").df().shape == (28, 18) duckdb_connection.close() diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py index f649180..5e77731 100644 --- a/tests/test_io/test_omop.py +++ b/tests/test_io/test_omop.py @@ -22,6 +22,7 @@ "drug_era": 2, "dose_era": 2, "condition_era": 2, + "episode": 2, } # constants for setup_variables @@ -627,6 +628,66 @@ def test_setup_variables( [[1, 1, 1, 1], [1, 1, 1, 1]], ], ), + ( + ["episode"], + ["episode_source_value"], + "start", + [ + [[5, np.nan, np.nan, np.nan], [10, np.nan, np.nan, np.nan]], + [[5, np.nan, np.nan, np.nan], [10, np.nan, np.nan, np.nan]], + [[5, np.nan, np.nan, np.nan], [10, np.nan, np.nan, np.nan]], + ], + ), + ( + ["episode"], + ["episode_source_value"], + "end", + [ + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + ], + ), + ( + ["episode"], + ["episode_source_value"], + "interval", + [ + [[5, 5, 5, 5], [10, 10, 10, 10]], + [[5, 5, 5, 5], [10, 10, 10, 10]], + [[5, 5, 5, 5], [10, 10, 10, 10]], + ], + ), + ( + ["episode"], + ["is_present"], + "start", + [ + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + ], + ), + ( + ["episode"], + ["is_present"], + "end", + [ + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + ], + ), + ( + ["episode"], + ["is_present"], + "interval", + [ + [[1, 1, 1, 1], [1, 1, 1, 1]], + [[1, 1, 1, 1], [1, 1, 1, 1]], + [[1, 1, 1, 1], [1, 1, 1, 1]], + ], + ), ], ) 
@pytest.mark.parametrize( From 8b070c47f27505f5d6c05ce05f1d83b5d66651dc Mon Sep 17 00:00:00 2001 From: eroell Date: Fri, 22 Nov 2024 17:02:45 +0100 Subject: [PATCH 43/43] Refactor time interval table query function --- src/ehrdata/io/omop/_queries.py | 94 +++-------------- src/ehrdata/io/omop/omop.py | 54 +++------- tests/test_io/test_omop.py | 180 ++++++++------------------------ 3 files changed, 79 insertions(+), 249 deletions(-) diff --git a/src/ehrdata/io/omop/_queries.py b/src/ehrdata/io/omop/_queries.py index 2cdaa34..f1937c5 100644 --- a/src/ehrdata/io/omop/_queries.py +++ b/src/ehrdata/io/omop/_queries.py @@ -115,7 +115,7 @@ def _generate_value_query(data_table: str, data_field_to_keep: Sequence, aggrega return is_present_query + value_query -def time_interval_table_query_long_format( +def _time_interval_table( backend_handle: duckdb.duckdb.DuckDBPyConnection, time_defining_table: str, data_table: str, @@ -124,14 +124,13 @@ def time_interval_table_query_long_format( num_intervals: int, aggregation_strategy: str, data_field_to_keep: Sequence[str] | str, - date_prefix: str = "", -) -> pd.DataFrame: - """Returns a long format DataFrame from the data_table. The following columns should be considered the indices of this long format: person_id, data_table_concept_id, interval_step. The other columns, except for start_date and end_date, should be considered the values.""" + keep_date: str = "", +): if isinstance(data_field_to_keep, str): data_field_to_keep = [data_field_to_keep] - if date_prefix == "": - date_prefix = "timepoint" + if keep_date == "": + keep_date = "timepoint" timedeltas_dataframe = _generate_timedeltas(interval_length_number, interval_length_unit, num_intervals) @@ -146,8 +145,7 @@ def time_interval_table_query_long_format( # 3. Create long_format_backbone, which is the left join of person_time_defining_table and person_data_table. # 4. Create long_format_intervals, which is the cross product of long_format_backbone and timedeltas. This table contains most notably the person_id, the concept_id, the interval start and end dates. # 5. Create the final table, which is the join with the data_table (typically measurement); each measurement is assigned to its person_id, its concept_id, and the interval it fits into. 
- df = backend_handle.execute( - f""" + prepare_alias_query = f""" WITH person_time_defining_table AS ( \ SELECT person.person_id as person_id, {DATA_TABLE_DATE_KEYS["start"][time_defining_table]} as start_date, {DATA_TABLE_DATE_KEYS["end"][time_defining_table]} as end_date \ FROM person \ @@ -176,79 +174,18 @@ def time_interval_table_query_long_format( SELECT *, 1 as is_present \ FROM {data_table} \ ) \ + """ + + if keep_date in ["timepoint", "start", "end"]: + select_query = f""" SELECT lfi.person_id, lfi.data_table_concept_id, interval_step, interval_start, interval_end, {_generate_value_query("data_table_with_presence_indicator", data_field_to_keep, AGGREGATION_STRATEGY_KEY[aggregation_strategy])} \ FROM long_format_intervals as lfi \ - LEFT JOIN data_table_with_presence_indicator ON lfi.person_id = data_table_with_presence_indicator.person_id AND lfi.data_table_concept_id = data_table_with_presence_indicator.{DATA_TABLE_CONCEPT_ID_TRUNK[data_table]}_concept_id AND data_table_with_presence_indicator.{DATA_TABLE_DATE_KEYS[date_prefix][data_table]} BETWEEN lfi.interval_start AND lfi.interval_end \ + LEFT JOIN data_table_with_presence_indicator ON lfi.person_id = data_table_with_presence_indicator.person_id AND lfi.data_table_concept_id = data_table_with_presence_indicator.{DATA_TABLE_CONCEPT_ID_TRUNK[data_table]}_concept_id AND data_table_with_presence_indicator.{DATA_TABLE_DATE_KEYS[keep_date][data_table]} BETWEEN lfi.interval_start AND lfi.interval_end \ GROUP BY lfi.person_id, lfi.data_table_concept_id, interval_step, interval_start, interval_end """ - ).df() - - _drop_timedeltas(backend_handle) - return df - - -def time_interval_table_for_interval_tables_query_long_format( - backend_handle: duckdb.duckdb.DuckDBPyConnection, - time_defining_table: str, - data_table: str, - interval_length_number: int, - interval_length_unit: str, - num_intervals: int, - aggregation_strategy: str, - data_field_to_keep: Sequence[str] | str, - date_prefix: str = "", -) -> pd.DataFrame: - """Returns a long format DataFrame from the data_table. The following columns should be considered the indices of this long format: person_id, data_table_concept_id, interval_step. The other columns, except for start_date and end_date, should be considered the values.""" - if isinstance(data_field_to_keep, str): - data_field_to_keep = [data_field_to_keep] - - if date_prefix != "": - date_prefix = date_prefix + "_" - - timedeltas_dataframe = _generate_timedeltas(interval_length_number, interval_length_unit, num_intervals) - - _write_timedeltas_to_db( - backend_handle, - timedeltas_dataframe, - ) - - # multi-step query - # 1. Create person_time_defining_table, which matches the one created for obs. Needs to contain the person_id, and the start date in particular. - # 2. Create person_data_table (data_table is typically measurement), which contains the cross product of person_id and the distinct concept_id s. - # 3. Create long_format_backbone, which is the left join of person_time_defining_table and person_data_table. - # 4. Create long_format_intervals, which is the cross product of long_format_backbone and timedeltas. This table contains most notably the person_id, the concept_id, the interval start and end dates. - # 5. Create the final table, which is the join with the data_table (typically measurement); each measurement is assigned to its person_id, its concept_id, and the interval it fits into. 
- df = backend_handle.execute( - f""" - WITH person_time_defining_table AS ( \ - SELECT person.person_id as person_id, {DATA_TABLE_DATE_KEYS["start"][time_defining_table]} as start_date, {DATA_TABLE_DATE_KEYS["end"][time_defining_table]} as end_date \ - FROM person \ - JOIN {time_defining_table} ON person.person_id = {time_defining_table}.{TIME_DEFINING_TABLE_SUBJECT_KEY[time_defining_table]} \ - ), \ - person_data_table AS( \ - WITH distinct_data_table_concept_ids AS ( \ - SELECT DISTINCT {DATA_TABLE_CONCEPT_ID_TRUNK[data_table]}_concept_id - FROM {data_table} \ - ) - SELECT person.person_id, {DATA_TABLE_CONCEPT_ID_TRUNK[data_table]}_concept_id as data_table_concept_id \ - FROM person \ - CROSS JOIN distinct_data_table_concept_ids \ - ), \ - long_format_backbone as ( \ - SELECT person_time_defining_table.person_id, data_table_concept_id, start_date, end_date \ - FROM person_time_defining_table \ - LEFT JOIN person_data_table USING(person_id)\ - ), \ - long_format_intervals as ( \ - SELECT person_id, data_table_concept_id, interval_step, start_date, start_date + interval_start_offset as interval_start, start_date + interval_end_offset as interval_end \ - FROM long_format_backbone \ - CROSS JOIN timedeltas \ - ), \ - data_table_with_presence_indicator as( \ - SELECT *, 1 as is_present \ - FROM {data_table} \ - ) \ + elif keep_date == "interval": + select_query = f""" SELECT lfi.person_id, lfi.data_table_concept_id, interval_step, interval_start, interval_end, {_generate_value_query("data_table_with_presence_indicator", data_field_to_keep, AGGREGATION_STRATEGY_KEY[aggregation_strategy])} \ FROM long_format_intervals as lfi \ LEFT JOIN data_table_with_presence_indicator ON lfi.person_id = data_table_with_presence_indicator.person_id \ @@ -258,7 +195,10 @@ def time_interval_table_for_interval_tables_query_long_format( OR (data_table_with_presence_indicator.{DATA_TABLE_DATE_KEYS["start"][data_table]} < lfi.interval_start AND data_table_with_presence_indicator.{DATA_TABLE_DATE_KEYS["end"][data_table]} > lfi.interval_end)) \ GROUP BY lfi.person_id, lfi.data_table_concept_id, interval_step, interval_start, interval_end """ - ).df() + + query = prepare_alias_query + select_query + + df = backend_handle.execute(query).df() _drop_timedeltas(backend_handle) diff --git a/src/ehrdata/io/omop/omop.py b/src/ehrdata/io/omop/omop.py index 85d8de7..8f8a5c4 100644 --- a/src/ehrdata/io/omop/omop.py +++ b/src/ehrdata/io/omop/omop.py @@ -31,10 +31,7 @@ _check_valid_observation_table, _check_valid_variable_data_tables, ) -from ehrdata.io.omop._queries import ( - time_interval_table_for_interval_tables_query_long_format, - time_interval_table_query_long_format, -) +from ehrdata.io.omop._queries import _time_interval_table from ehrdata.utils._omop_utils import get_table_catalog_dict DOWNLOAD_VERIFICATION_TAG = "download_verification_tag" @@ -335,7 +332,7 @@ def setup_variables( return edata ds = ( - time_interval_table_query_long_format( + _time_interval_table( backend_handle=backend_handle, time_defining_table=time_defining_table, data_table=data_tables[0], @@ -437,7 +434,7 @@ def setup_interval_variables( Strategy to use when aggregating multiple data points within one interval. enrich_var_with_feature_info Whether to enrich the var table with feature information. If a concept_id is not found in the concept table, the feature information will be NaN. - keep_date + date_type Whether to keep the start or end date, or the interval span. 
Returns @@ -469,38 +466,21 @@ def setup_interval_variables( logging.info(f"No data in {data_tables}.") return edata - if keep_date == "start" or keep_date == "end": - ds = ( - time_interval_table_query_long_format( - backend_handle=backend_handle, - time_defining_table=time_defining_table, - data_table=data_tables[0], - data_field_to_keep=data_field_to_keep, - interval_length_number=interval_length_number, - interval_length_unit=interval_length_unit, - num_intervals=num_intervals, - aggregation_strategy=aggregation_strategy, - date_prefix=keep_date, - ) - .set_index(["person_id", "data_table_concept_id", "interval_step"]) - .to_xarray() - ) - elif keep_date == "interval": - ds = ( - time_interval_table_for_interval_tables_query_long_format( - backend_handle=backend_handle, - time_defining_table=time_defining_table, - data_table=data_tables[0], - data_field_to_keep=data_field_to_keep, - interval_length_number=interval_length_number, - interval_length_unit=interval_length_unit, - num_intervals=num_intervals, - aggregation_strategy=aggregation_strategy, - date_prefix=keep_date, - ) - .set_index(["person_id", "data_table_concept_id", "interval_step"]) - .to_xarray() + ds = ( + _time_interval_table( + backend_handle=backend_handle, + time_defining_table=time_defining_table, + data_table=data_tables[0], + data_field_to_keep=data_field_to_keep, + interval_length_number=interval_length_number, + interval_length_unit=interval_length_unit, + num_intervals=num_intervals, + aggregation_strategy=aggregation_strategy, + keep_date=keep_date, ) + .set_index(["person_id", "data_table_concept_id", "interval_step"]) + .to_xarray() + ) var = ds["data_table_concept_id"].to_dataframe() diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py index 5e77731..ac426e7 100644 --- a/tests/test_io/test_omop.py +++ b/tests/test_io/test_omop.py @@ -25,6 +25,24 @@ "episode": 2, } +VANILLA_IS_PRESENT_START = [ + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], + [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], +] + +VANILLA_IS_PRESENT_END = [ + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], + [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], +] + +VANILLA_IS_PRESENT_INTERVAL = [ + [[1, 1, 1, 1], [1, 1, 1, 1]], + [[1, 1, 1, 1], [1, 1, 1, 1]], + [[1, 1, 1, 1], [1, 1, 1, 1]], +] + # constants for setup_variables # only data_table_concept_id VAR_DIM_BASE = 1 @@ -113,11 +131,7 @@ def test_setup_obs_invalid_observation_table_value(omop_connection_vanilla): ( ["measurement"], ["is_present"], - [ - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - ], + VANILLA_IS_PRESENT_START, ), ( ["observation"], @@ -131,11 +145,7 @@ def test_setup_obs_invalid_observation_table_value(omop_connection_vanilla): ( ["observation"], ["is_present"], - [ - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - ], + VANILLA_IS_PRESENT_START, ), ( ["specimen"], @@ -149,11 +159,7 @@ def test_setup_obs_invalid_observation_table_value(omop_connection_vanilla): ( ["specimen"], ["is_present"], - [ - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, 
np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - ], + VANILLA_IS_PRESENT_START, ), ], ) @@ -242,31 +248,19 @@ def test_setup_variables( ["drug_exposure"], ["is_present"], "start", - [ - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - ], + VANILLA_IS_PRESENT_START, ), ( ["drug_exposure"], ["is_present"], "end", - [ - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - ], + VANILLA_IS_PRESENT_END, ), ( ["drug_exposure"], ["is_present"], "interval", - [ - [[1, 1, 1, 1], [1, 1, 1, 1]], - [[1, 1, 1, 1], [1, 1, 1, 1]], - [[1, 1, 1, 1], [1, 1, 1, 1]], - ], + VANILLA_IS_PRESENT_INTERVAL, ), ( ["condition_occurrence"], @@ -302,31 +296,19 @@ def test_setup_variables( ["condition_occurrence"], ["is_present"], "start", - [ - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - ], + VANILLA_IS_PRESENT_START, ), ( ["condition_occurrence"], ["is_present"], "end", - [ - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - ], + VANILLA_IS_PRESENT_END, ), ( ["condition_occurrence"], ["is_present"], "interval", - [ - [[1, 1, 1, 1], [1, 1, 1, 1]], - [[1, 1, 1, 1], [1, 1, 1, 1]], - [[1, 1, 1, 1], [1, 1, 1, 1]], - ], + VANILLA_IS_PRESENT_INTERVAL, ), ( ["procedure_occurrence"], @@ -362,31 +344,19 @@ def test_setup_variables( ["procedure_occurrence"], ["is_present"], "start", - [ - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - ], + VANILLA_IS_PRESENT_START, ), ( ["procedure_occurrence"], ["is_present"], "end", - [ - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - ], + VANILLA_IS_PRESENT_END, ), ( ["procedure_occurrence"], ["is_present"], "interval", - [ - [[1, 1, 1, 1], [1, 1, 1, 1]], - [[1, 1, 1, 1], [1, 1, 1, 1]], - [[1, 1, 1, 1], [1, 1, 1, 1]], - ], + VANILLA_IS_PRESENT_INTERVAL, ), ( ["device_exposure"], @@ -422,31 +392,19 @@ def test_setup_variables( ["device_exposure"], ["is_present"], "start", - [ - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - ], + VANILLA_IS_PRESENT_START, ), ( ["device_exposure"], ["is_present"], "end", - [ - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - ], + VANILLA_IS_PRESENT_END, ), ( ["device_exposure"], ["is_present"], "interval", - [ - [[1, 1, 1, 1], [1, 1, 1, 1]], - [[1, 1, 1, 1], [1, 1, 1, 1]], - [[1, 1, 1, 1], [1, 1, 1, 1]], - ], + VANILLA_IS_PRESENT_INTERVAL, ), ( ["drug_era"], @@ -482,31 +440,19 @@ def test_setup_variables( 
["drug_era"], ["is_present"], "start", - [ - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - ], + VANILLA_IS_PRESENT_START, ), ( ["drug_era"], ["is_present"], "end", - [ - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - ], + VANILLA_IS_PRESENT_END, ), ( ["drug_era"], ["is_present"], "interval", - [ - [[1, 1, 1, 1], [1, 1, 1, 1]], - [[1, 1, 1, 1], [1, 1, 1, 1]], - [[1, 1, 1, 1], [1, 1, 1, 1]], - ], + VANILLA_IS_PRESENT_INTERVAL, ), ( ["dose_era"], @@ -542,31 +488,19 @@ def test_setup_variables( ["dose_era"], ["is_present"], "start", - [ - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - ], + VANILLA_IS_PRESENT_START, ), ( ["dose_era"], ["is_present"], "end", - [ - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - ], + VANILLA_IS_PRESENT_END, ), ( ["dose_era"], ["is_present"], "interval", - [ - [[1, 1, 1, 1], [1, 1, 1, 1]], - [[1, 1, 1, 1], [1, 1, 1, 1]], - [[1, 1, 1, 1], [1, 1, 1, 1]], - ], + VANILLA_IS_PRESENT_INTERVAL, ), ( ["condition_era"], @@ -602,31 +536,19 @@ def test_setup_variables( ["condition_era"], ["is_present"], "start", - [ - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - ], + VANILLA_IS_PRESENT_START, ), ( ["condition_era"], ["is_present"], "end", - [ - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - ], + VANILLA_IS_PRESENT_END, ), ( ["condition_era"], ["is_present"], "interval", - [ - [[1, 1, 1, 1], [1, 1, 1, 1]], - [[1, 1, 1, 1], [1, 1, 1, 1]], - [[1, 1, 1, 1], [1, 1, 1, 1]], - ], + VANILLA_IS_PRESENT_INTERVAL, ), ( ["episode"], @@ -662,31 +584,19 @@ def test_setup_variables( ["episode"], ["is_present"], "start", - [ - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - [[1, np.nan, np.nan, np.nan], [1, np.nan, np.nan, np.nan]], - ], + VANILLA_IS_PRESENT_START, ), ( ["episode"], ["is_present"], "end", - [ - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - [[np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan]], - ], + VANILLA_IS_PRESENT_END, ), ( ["episode"], ["is_present"], "interval", - [ - [[1, 1, 1, 1], [1, 1, 1, 1]], - [[1, 1, 1, 1], [1, 1, 1, 1]], - [[1, 1, 1, 1], [1, 1, 1, 1]], - ], + VANILLA_IS_PRESENT_INTERVAL, ), ], )