From 03f47b457bef22c88dfbb16300fde090750a881b Mon Sep 17 00:00:00 2001 From: Matt Garber Date: Fri, 9 Aug 2024 09:27:22 -0400 Subject: [PATCH 1/3] Added convenience subtables This adds two derived tables, mrrel_is_a, and mrconso_drugs, in order to speed up long queries in vocab lookup scenarios. It also bumps the version as part of the library 3.0 cutover, and moves the version definition to __init__. --- README.md | 11 +++++++++++ cumulus_library_umls/__init__.py | 1 + cumulus_library_umls/umls/umls_builder.py | 23 ++++++++++++++++++----- pyproject.toml | 6 +++--- 4 files changed, 33 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index bf32867..b954b82 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,17 @@ Note: This study is explicitly namespaced in its own schema, `umls`. Make sure y database is not using this schema for another use. Do not create tables inside this schema by another means. +## Additional custom tables + +The following tables are derived from the primary tables, and are included here as a +convenience to avoid having to compute these on a repeated basis. + +- **mrrel_is_a** a subset of the relationships in mrrel, including only those that define +that concept A is a member of concept B (i.e. is a child, or is explicitly marked as +being a tradename/member belonging to the parent concept). +- **mrconso_drugs** a subset of the entity list in mrconso, limited to vocabularies +specifically dealing with drug identifiers (e.g. SNOMED, RxNorm, etc.) 
+ ## Licensing details The `cumulus-library-umls` study is provided as a convenience to install the diff --git a/cumulus_library_umls/__init__.py b/cumulus_library_umls/__init__.py index e69de29..5becc17 100644 --- a/cumulus_library_umls/__init__.py +++ b/cumulus_library_umls/__init__.py @@ -0,0 +1 @@ +__version__ = "1.0.0" diff --git a/cumulus_library_umls/umls/umls_builder.py b/cumulus_library_umls/umls/umls_builder.py index f7ab343..ecd4626 100644 --- a/cumulus_library_umls/umls/umls_builder.py +++ b/cumulus_library_umls/umls/umls_builder.py @@ -1,7 +1,7 @@ import pathlib import pandas -from cumulus_library import base_table_builder, base_utils, databases +from cumulus_library import base_table_builder, base_utils, study_manifest from cumulus_library.apis import umls from cumulus_library.template_sql import base_templates @@ -145,10 +145,9 @@ def create_parquet( def prepare_queries( self, - cursor: databases.DatabaseCursor, - schema: str, + config: base_utils.StudyConfig, + manifest: study_manifest.StudyManifest, *args, - config=base_utils.StudyConfig, **kwargs, ): download_path = pathlib.Path(__file__).resolve().parent / "downloads" @@ -184,7 +183,7 @@ def prepare_queries( ) self.queries.append( base_templates.get_ctas_from_parquet_query( - schema_name=schema, + schema_name=config.schema, table_name=f"umls__{file.stem}", local_location=parquet_path / f"{file.stem}.parquet", remote_location=remote_path, @@ -193,3 +192,17 @@ def prepare_queries( ) ) progress.advance(task) + + # Section for reusable cross-study helper tables + self.queries.append( + """CREATE TABLE umls__mrrel_is_a AS +SELECT * FROM umls.mrrel +WHERE REL = 'CHD' +OR RELA in ('isa','tradename_of','has_tradename','has_basis_of_strength_substance')""" + ) + self.queries.append( + """CREATE TABLE umls__mrconso_drugs AS +SELECT * FROM umls.mrconso +WHERE SAB in ('ATC','CVX','DRUGBANK','GS','MMSL','MMX','MTHCMSFRFMTHSPL','NDDF', + 'RXNORM','SNOMEDCT_US','USP','VANDF')""" + ) diff --git a/pyproject.toml 
b/pyproject.toml index a4916a2..ab0c563 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,9 +1,8 @@ [project] -version = "0.2.0" name = "cumulus-library-umls" -requires-python = ">= 3.10" +requires-python = ">= 3.11" dependencies = [ - "cumulus-library >= 2.3", + "cumulus-library >= 3.0", ] description = "A Unified Medical Language System® Metathesaurus study for the Cumulus project" readme = "README.md" @@ -15,6 +14,7 @@ classifiers = [ "Programming Language :: Python :: 3", "Topic :: Software Development :: Libraries :: Python Modules", ] +dynamic=["version"] [project.optional-dependencies] dev = [ "ruff == 0.2.1", From 884ea6810d02a7cfabd86efd7b79cbf2757f49a7 Mon Sep 17 00:00:00 2001 From: Matt Garber Date: Fri, 9 Aug 2024 09:37:07 -0400 Subject: [PATCH 2/3] CI update --- .github/workflows/ci.yaml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 738c449..f329fa3 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -9,7 +9,9 @@ jobs: - name: Set up Python uses: actions/setup-python@v5 with: - python-version: '3.10' + python-version: '3.11' + - name: Get library from main + run: pip install git+https://github.com/smart-on-fhir/cumulus-library.git - name: Install linters run: | python -m pip install --upgrade pip @@ -29,7 +31,10 @@ jobs: - name: Set up Python uses: actions/setup-python@v5 with: - python-version: '3.10' + python-version: '3.11' + + - name: Get library from main + run: pip install git+https://github.com/smart-on-fhir/cumulus-library.git - name: Install dependencies run: | From 8e78b7f2cace8a4933a944995793e41e40aea877 Mon Sep 17 00:00:00 2001 From: Matt Garber Date: Fri, 9 Aug 2024 09:43:50 -0400 Subject: [PATCH 3/3] test updates --- tests/test_umls_builder.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tests/test_umls_builder.py b/tests/test_umls_builder.py index 2ada8db..f71c004 100644 --- 
a/tests/test_umls_builder.py +++ b/tests/test_umls_builder.py @@ -3,7 +3,7 @@ import pytest import responses -from cumulus_library import base_utils, databases, db_config +from cumulus_library import base_utils, databases, db_config, study_manifest from cumulus_library_umls.umls import umls_builder @@ -59,10 +59,13 @@ def test_create_query(mock_resolve, mock_responses, tmp_path): db_config.db_type = "duckdb" config = base_utils.StudyConfig( - db=databases.DuckDatabaseBackend(f"{tmp_path}/duckdb"), umls_key="123" + db=databases.DuckDatabaseBackend(f"{tmp_path}/duckdb"), + umls_key="123", + schema="main", ) builder = umls_builder.UMLSBuilder() - builder.prepare_queries(cursor=config.db.cursor(), schema="main", config=config) + manifest = study_manifest.StudyManifest() + builder.prepare_queries(config=config, manifest=manifest) expected = f"""CREATE TABLE IF NOT EXISTS umls__TESTTABLE AS SELECT TTY, CODE @@ -88,10 +91,13 @@ def test_create_query_download_exists(mock_resolve, mock_responses, tmp_path): db_config.db_type = "duckdb" config = base_utils.StudyConfig( - db=databases.DuckDatabaseBackend(f"{tmp_path}/duckdb"), umls_key="123" + db=databases.DuckDatabaseBackend(f"{tmp_path}/duckdb"), + umls_key="123", + schema="main", ) builder = umls_builder.UMLSBuilder() - builder.prepare_queries(cursor=config.db.cursor(), schema="main", config=config) + manifest = study_manifest.StudyManifest() + builder.prepare_queries(config=config, manifest=manifest) download_dirs = sorted((tmp_path / "downloads").iterdir()) assert len(download_dirs) == 1 assert "2000AA" in str(download_dirs[0])