From 3503ebc859a126a35802cb5156e73721e651c2f8 Mon Sep 17 00:00:00 2001
From: Matt Garber
Date: Wed, 8 May 2024 12:15:58 -0400
Subject: [PATCH] UMLS table creation

---
 .github/workflows/ci.yaml                 |  21 +++
 .github/workflows/pypi.yaml               |  25 ++++
 .gitignore                                | 137 ++++++++++++++++++
 .pre-commit-config.yaml                   |  17 +++
 README.MD                                 |  39 ++++-
 cumulus_library_umls/__init__.py          |   0
 cumulus_library_umls/umls/manifest.toml   |   6 +
 cumulus_library_umls/umls/umls_builder.py | 164 ++++++++++++++++++++++
 pyproject.toml                            |  65 +++++++++
 tests/__init__.py                         |   0
 tests/test_data/2000AA.zip                | Bin 0 -> 880 bytes
 tests/test_data/2000AA/META/TESTTABLE.RRF |   3 +
 tests/test_data/2000AA/META/TESTTABLE.ctl |  13 ++
 tests/test_data/README.MD                 |   5 +
 tests/test_umls_builder.py                | 111 +++++++++++++++
 15 files changed, 605 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/ci.yaml
 create mode 100644 .github/workflows/pypi.yaml
 create mode 100644 .gitignore
 create mode 100644 .pre-commit-config.yaml
 create mode 100644 cumulus_library_umls/__init__.py
 create mode 100644 cumulus_library_umls/umls/manifest.toml
 create mode 100644 cumulus_library_umls/umls/umls_builder.py
 create mode 100644 pyproject.toml
 create mode 100644 tests/__init__.py
 create mode 100644 tests/test_data/2000AA.zip
 create mode 100644 tests/test_data/2000AA/META/TESTTABLE.RRF
 create mode 100644 tests/test_data/2000AA/META/TESTTABLE.ctl
 create mode 100644 tests/test_data/README.MD
 create mode 100644 tests/test_umls_builder.py

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
new file mode 100644
index 0000000..95c87df
--- /dev/null
+++ b/.github/workflows/ci.yaml
+name: CI
+on: [push]
+jobs:
+
+  lint:
+    runs-on: ubuntu-22.04
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+      - name: Install linters
+        run: |
+          python -m pip install --upgrade pip
+          pip install ruff==0.2.1
+      - name: Run ruff
+        if: success() || failure()  # still run the format check if earlier steps fail
+        run: |
+          ruff check
+          ruff format --check
\ No newline at end of file
diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml
new file mode 100644
index 0000000..b491448
--- /dev/null
+++ b/.github/workflows/pypi.yaml
+name: PyPI
+
+on:
+  release:
+    types: [created]
+
+jobs:
+  publish:
+    runs-on: ubuntu-22.04
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install build
+
+      - name: Build
+        run: python -m build
+
+      - name: Publish
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          password: ${{ secrets.PYPI_API_TOKEN }}
+          print_hash: true
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e8c465f
--- /dev/null
+++ b/.gitignore
+downloads/
+generated_parquet/
+
+# project specific
+downloads/
+generated_parquet/
+output.sql
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..ad3f57a
--- /dev/null
+++ b/.pre-commit-config.yaml
+default_install_hook_types: [pre-commit, pre-push]
+repos:
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.2.1
+    hooks:
+      - name: Ruff formatting
+        id: ruff-format
+      - name: Ruff linting
+        id: ruff
+        stages: [pre-push]
+
+  - repo: https://github.com/sqlfluff/sqlfluff
+    rev: 2.3.4
+    hooks:
+      - id: sqlfluff-lint
+        types: []
+        types_or: [sql, jinja]
\ No newline at end of file
diff --git a/README.MD b/README.MD
index d5367dd..fe15503 100644
--- a/README.MD
+++ b/README.MD
-# Cumulus Library UMLS
\ No newline at end of file
+# Cumulus Library UMLS
+
+An installation of the Unified Medical Language System® Metathesaurus®, as part
+of the [SMART on FHIR Cumulus Project](https://smarthealthit.org/cumulus-a-universal-sidecar-for-a-smart-learning-healthcare-system/).
+
+For more information, [browse the documentation](https://docs.smarthealthit.org/cumulus/library).
+
+## Usage
+
+In order to use the Metathesaurus, you'll need an API key from the National
+Library of Medicine, which you can sign up for
+[here](https://uts.nlm.nih.gov/uts/signup-login).
+
+You can then install this module by running `pip install cumulus-library-umls`.
+
+This will add a `umls` target to `cumulus-library`. You'll need to pass your
+API key via the `--umls-key` CLI flag, or set the `UMLS_API_KEY` environment
+variable to the key you received from NIH.
+
+This is a fairly intensive operation: we download a large file, extract it,
+convert the contents to parquet files, and then upload them to Athena. It
+usually takes about half an hour to run. We try to preserve some of those
+artifacts along the way to make rebuilds faster. If you need to force
+recreation from scratch, the `--replace-existing` CLI flag will handle this.
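+
+For example, a typical invocation might look like the following (illustrative
+only; the exact flag spelling depends on your `cumulus-library` version, so
+check `cumulus-library --help`):
+
+```sh
+export UMLS_API_KEY=your-api-key
+cumulus-library build --target umls
+
+# Or force a full rebuild, passing the key explicitly:
+cumulus-library build --target umls --umls-key your-api-key --replace-existing
+```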
+
+## Licensing details
+
+The `cumulus-library-umls` study is provided as a convenience to install the
+UMLS Metathesaurus, but is not shipped with the Metathesaurus dataset. It will
+require an API key to download the data from NIH directly.
+
+As a reminder, the
+[License Agreement for Use of the UMLS® Metathesaurus®](https://uts.nlm.nih.gov/uts/assets/LicenseAgreement.pdf)
+places several restrictions on the usage of this data (including distributing
+the dataset). When you sign up for a UMLS key, you are assuming responsibility
+for complying with these terms, or with an alternate licensing agreement with
+the owner of the Metathesaurus data if you have been provided one.
+
+
+## Citations
+
+Bodenreider O. The Unified Medical Language System (UMLS): integrating biomedical terminology. Nucleic Acids Res. 2004 Jan 1;32(Database issue):D267-70. doi: 10.1093/nar/gkh061. PubMed PMID: 14681409; PubMed Central PMCID: PMC308795.
\ No newline at end of file
diff --git a/cumulus_library_umls/__init__.py b/cumulus_library_umls/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/cumulus_library_umls/umls/manifest.toml b/cumulus_library_umls/umls/manifest.toml
new file mode 100644
index 0000000..002a324
--- /dev/null
+++ b/cumulus_library_umls/umls/manifest.toml
+study_prefix = "umls"
+
+[table_builder_config]
+file_names = [
+    "umls_builder.py"
+]
diff --git a/cumulus_library_umls/umls/umls_builder.py b/cumulus_library_umls/umls/umls_builder.py
new file mode 100644
index 0000000..0c559fc
--- /dev/null
+++ b/cumulus_library_umls/umls/umls_builder.py
+import pathlib
+
+import pandas
+
+from cumulus_library import base_table_builder, base_utils, databases
+from cumulus_library.apis import umls
+from cumulus_library.template_sql import base_templates
+
+
+class UMLSBuilder(base_table_builder.BaseTableBuilder):
+    def rmtree(self, root: pathlib.Path):
+        # Just in case: we may get passed a file rather than a directory (like
+        # if there's an error and a zip file exists in the download dir)
+        if not root.is_dir():
+            root.unlink()
+        else:
+            for p in root.iterdir():
+                if p.is_dir():
+                    self.rmtree(p)
+                else:
+                    p.unlink()
+            root.rmdir()
+
+    def get_umls_data(
+        self,
+        download_path: pathlib.Path,
+        parquet_path: pathlib.Path,
+        force_upload: bool,
+        umls_key: str,
+    ):
+        api = umls.UmlsApi(api_key=umls_key)
+        metadata = api.get_latest_umls_file_release(
+            target="umls-metathesaurus-full-subset"
+        )
+        download_required = False
+        if not (download_path / metadata["releaseVersion"]).exists():
+            print("New UMLS release available, downloading & updating...")
+            download_required = True
+            # Only keep one release on disk at a time
+            for version in download_path.iterdir():
+                self.rmtree(version)
+            for version in parquet_path.iterdir():
+                self.rmtree(version)
+        if download_required or force_upload:
+            api.download_umls_files(
+                target="umls-metathesaurus-full-subset", path=download_path
+            )
+        files = list(download_path.glob(f'./{metadata["releaseVersion"]}/META/*.ctl'))
+        filtered_files = []
+        for file in files:
+            # Skip the MRX* word-index tables
+            if not file.stem.startswith("MRX"):
+                filtered_files.append(file)
+        return filtered_files, download_required, metadata["releaseVersion"]
+
+    def sql_type_to_df_parquet_type(self, text):
+        # Normalize a SQL*Loader type like 'char(10)' or 'integer external'
+        # down to its base type name
+        text = text.split("(")[0].strip(",").replace(" external", "")
+        match text:
+            case "char":
+                return "string", "String"
+            case "integer":
+                return "Int64", "Integer"
+            case "float":
+                return "float", "Float"
+            case _:
+                raise ValueError(f"'{text}' missing a type converter")
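+
+    # For orientation, a sketch of what parse_ctl_file produces (illustrative
+    # only, based on the tests/test_data fixture; columns in the .ctl files
+    # are tab-separated). Given a .ctl fragment like:
+    #     infile 'TESTTABLE.RRF'
+    #     (TTY	char(10),
+    #     CODE	char(8)
+    #     )
+    # it should return:
+    #     datasource == "TESTTABLE.RRF"
+    #     table == {
+    #         "headers": ["TTY", "CODE"],
+    #         "dtype": {"TTY": "string", "CODE": "string"},
+    #         "parquet_types": ["String", "String"],
+    #     }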
+    def parse_ctl_file(self, contents: list[str]):
+        datasource = None
+        table = {"headers": [], "dtype": {}, "parquet_types": []}
+        is_col_def_section = False
+        for line in contents:
+            if line is None:
+                continue
+            if line.startswith("infile"):
+                datasource = line.split(" ")[1].rstrip().replace("'", "")
+            elif line.startswith("("):
+                is_col_def_section = True
+                line = line[1:]
+            elif line.startswith(")"):
+                is_col_def_section = False
+            if is_col_def_section:
+                if line is not None:
+                    line = line.strip().split("\t")
+                    df_type, parquet_type = self.sql_type_to_df_parquet_type(line[1])
+                    table["headers"].append(line[0])
+                    table["dtype"][line[0]] = df_type
+                    table["parquet_types"].append(parquet_type)
+        return datasource, table
+
+    def create_parquet(
+        self,
+        rrf_path: pathlib.Path,
+        parquet_path: pathlib.Path,
+        table: dict,
+        force_upload=False,
+    ):
+        # Reuse a previously generated parquet file unless we're forcing a rebuild
+        if not force_upload:
+            if (parquet_path / f"{rrf_path.stem}.parquet").exists():
+                return
+        df = pandas.read_csv(
+            rrf_path,
+            delimiter="|",
+            names=table["headers"],
+            dtype=table["dtype"],
+            # RRF rows end with a trailing delimiter; index_col=False keeps
+            # pandas from treating the first column as an index
+            index_col=False,
+        )
+        df.to_parquet(parquet_path / f"{rrf_path.stem}.parquet")
+
+    def prepare_queries(
+        self,
+        cursor: databases.DatabaseCursor,
+        schema: str,
+        *args,
+        config: base_utils.StudyConfig,
+        **kwargs,
+    ):
+        download_path = pathlib.Path(__file__).resolve().parent / "downloads"
+        download_path.mkdir(exist_ok=True, parents=True)
+        parquet_path = pathlib.Path(__file__).resolve().parent / "generated_parquet"
+        parquet_path.mkdir(exist_ok=True, parents=True)
+        files, new_version, folder = self.get_umls_data(
+            download_path, parquet_path, config.force_upload, config.umls_key
+        )
+        parquet_path = parquet_path / folder
+        parquet_path.mkdir(exist_ok=True, parents=True)
+
+        with base_utils.get_progress_bar() as progress:
+            task = progress.add_task(
+                None,
+                total=len(files),
+            )
+            for file in files:
+                with open(file) as f:
+                    datasource, table = self.parse_ctl_file(f.readlines())
+                progress.update(task, description=f"Compressing {datasource}...")
+                rrf_path = download_path / f"./{folder}/META/{datasource}"
+                self.create_parquet(
+                    rrf_path, parquet_path, table, force_upload=config.force_upload
+                )
+                progress.update(task, description=f"Uploading {datasource}...")
+                remote_path = config.db.upload_file(
+                    file=parquet_path / f"{file.stem}.parquet",
+                    study="umls",
+                    topic=file.stem,
+                    remote_filename=f"{file.stem}.parquet",
+                    force_upload=config.force_upload or new_version,
+                )
+                self.queries.append(
+                    base_templates.get_ctas_from_parquet_query(
+                        schema_name=schema,
+                        table_name=f"umls__{file.stem}",
+                        # TODO: this local location needs to be modified to sometimes
+                        # expect a file rather than a dir, or the generated output
+                        # needs to reflect this better
+                        local_location=parquet_path / f"{file.stem}.parquet",
+                        remote_location=remote_path,
+                        table_cols=table["headers"],
+                        remote_table_cols_types=table["parquet_types"],
+                    )
+                )
+                progress.advance(task)
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..53d39ad
--- /dev/null
+++ b/pyproject.toml
+[project]
+name = "cumulus-library-umls"
+requires-python = ">= 3.10"
+dependencies = [
+    "cumulus-library >= 2.1",
+]
+description = "A Unified Medical Language System® Metathesaurus study for the Cumulus project"
+readme = "README.MD"
+license = { text = "Apache License 2.0" }
+keywords = ["FHIR", "SQL", "Health Informatics"]
+classifiers = [
+    "License :: OSI Approved :: Apache Software License",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python :: 3",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+]
+dynamic = ["version"]
+
+[project.optional-dependencies]
+dev = [
+    "ruff == 0.2.1",
+    "pre-commit",
+]
+test = [
+    "pytest",
+    "responses"
+]
+
+[project.urls]
+Home = "https://smarthealthit.org/cumulus-a-universal-sidecar-for-a-smart-learning-healthcare-system/"
+Documentation = "https://docs.smarthealthit.org/cumulus/"
+Source = "https://github.com/smart-on-fhir/cumulus-library-umls"
+
+[build-system]
+requires = ["flit_core >=3.4,<4"]
+build-backend = "flit_core.buildapi"
+
+[tool.flit.sdist]
+include = [".sqlfluff"]
+
+[tool.pytest.ini_options]
+minversion = "6.0"
+testpaths = [
+    "tests",
+]
+
+[tool.ruff]
+target-version = "py310"
+
+[tool.ruff.lint]
+select = [
+    "A",  # prevent using keywords that clobber python builtins
+    "B",  # bugbear: security warnings
+    "E",  # pycodestyle
+    "F",  # pyflakes
+    "I",  # isort
+    "ISC",  # implicit string concatenation
+    "PLE",  # pylint errors
+    "RUF",  # the ruff developer's own rules
+    "UP",  # alert you when better syntax is available in your python version
+]
+ignore = [
+    # Recommended ignore from `ruff format` due to in-project conflicts with check.
+    # It's expected that this will be fixed in the coming months.
+    "ISC001"
+]
\ No newline at end of file
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_data/2000AA.zip b/tests/test_data/2000AA.zip
new file mode 100644
index 0000000000000000000000000000000000000000..898844cb6a99aa5c7c2d34cd49ae3f56db9304ab
GIT binary patch
literal 880
zcmWIWW@h1H0D;Ga%Ok)HD8bGk!(e1!VBqMe9~#2Rz`QcwD(w>xmsW5yFtU7QWME(s
z0V)lE8<1LrY5)(y0AJS-s44RbtkOUjizy(}I2fM9t%#_f{L*F%koOaaCDBX?aSaX$
zadh%=)k`kP0Xr;ufpuCe5ThBH+Gm)3*g&B5`Mvc!e9fjz$ceMs(re4g)Mr&<<30U{
z!?dOnIj+ASCu){t9IyVq_xSX8TAE@xds51NEvkGf=A_g=Z_0tQQ%-6=Khz~(_QIfG
z&&!@x@h+`(584`ztY&db{@uNGeg5(~^SIheJ@W6hM~FS*Zt4^!q8%9pq<5W~ePm2E%YZ1=Mb`STzCt$9;_Ze8m)
MMo?fvf&mnmrx|OP$N+;t4R0_61-XHP0TikWFhiD`v(scv;W>ZLtEOunasfF_1w993
k~xm=U<1Y-Ba<96uIQHlS_=XK3~wDlG(0h|LJ|X7cq1EznG}!>1DOUj44N>2CSpw#
Am`yq77!DG$&g`5V>_COXo&=9JSdT1bug~rMmD|=XbZA~5ypc9AD8P{*?`VsU;)Bd
LpaFkDPGtZ9)pg!@

literal 0
HcmV?d00001

diff --git a/tests/test_data/2000AA/META/TESTTABLE.RRF b/tests/test_data/2000AA/META/TESTTABLE.RRF
new file mode 100644
index 0000000..fbbe655
--- /dev/null
+++ b/tests/test_data/2000AA/META/TESTTABLE.RRF
+TTY1|Code-1|
+TTY2|Code-2|
+TTY3|Code-3|
\ No newline at end of file
diff --git a/tests/test_data/2000AA/META/TESTTABLE.ctl b/tests/test_data/2000AA/META/TESTTABLE.ctl
new file mode 100644
index 0000000..fed3d33
--- /dev/null
+++ b/tests/test_data/2000AA/META/TESTTABLE.ctl
+options (direct=true)
+load data
+characterset UTF8 length semantics char
+infile 'TESTTABLE.RRF'
+badfile 'TESTTABLE.bad'
+discardfile 'TESTTABLE.dsc'
+truncate
+into table TESTTABLE
+fields terminated by '|'
+trailing nullcols
+(TTY	char(10),
+CODE	char(8)
+)
\ No newline at end of file
diff --git a/tests/test_data/README.MD b/tests/test_data/README.MD
new file mode 100644
index 0000000..bfc6a74
--- /dev/null
+++ b/tests/test_data/README.MD
+# Test data details
+
+2000AA.zip is generated by zipping the associated 2000AA folder. This matches
+the download format, as well as the .ctl/.rrf formats produced by the UMLS
+Metathesaurus exports.
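+
+To regenerate the zip after editing the fixtures (a hypothetical invocation;
+exact flags may vary with your zip tool):
+
+```sh
+cd tests/test_data
+zip -r 2000AA.zip 2000AA
+```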
\ No newline at end of file
diff --git a/tests/test_umls_builder.py b/tests/test_umls_builder.py
new file mode 100644
index 0000000..bc8a982
--- /dev/null
+++ b/tests/test_umls_builder.py
+import os
+from unittest import mock
+
+import pytest
+import responses
+
+from cumulus_library import base_utils, databases, db_config
+from cumulus_library_umls.umls import umls_builder
+
+AUTH_URL = "https://utslogin.nlm.nih.gov/validateUser"
+RELEASE_URL = "https://uts-ws.nlm.nih.gov/releases"
+DOWNLOAD_URL = "https://uts-ws.nlm.nih.gov/download"
+
+
+@pytest.fixture
+def mock_responses():
+    with responses.RequestsMock(assert_all_requests_are_fired=False) as response:
+        with open("./tests/test_data/2000AA.zip", "rb") as download_zip:
+            response.add(
+                responses.GET,
+                AUTH_URL,
+                body="true",
+                status=200,
+                content_type="application/json",
+            )
+            response.add(
+                responses.GET,
+                RELEASE_URL,
+                body="""[{
+                    "fileName": "2000AA.zip",
+                    "releaseVersion": "2000AA",
+                    "releaseDate": "2000-01-01",
+                    "downloadUrl": "https://download.nlm.nih.gov/umls/kss/2000AA/2000AA.zip",
+                    "releaseType": "UMLS Metathesaurus Level 0 Subset",
+                    "product": "UMLS",
+                    "current": true
+                }]""",
+                status=200,
+                content_type="application/json",
+            )
+            response.add(
+                responses.GET,
+                DOWNLOAD_URL,
+                body=download_zip.read(),
+                status=200,
+                content_type="application/zip",
+            )
+        yield response
+
+
+@mock.patch.dict(
+    os.environ,
+    clear=True,
+)
+@mock.patch("pathlib.Path.resolve")
+def test_create_query(mock_resolve, mock_responses, tmp_path):
+    mock_loc = tmp_path / "umls_builder.py"
+    mock_resolve.return_value = mock_loc
+
+    db_config.db_type = "duckdb"
+    config = base_utils.StudyConfig(
+        db=databases.DuckDatabaseBackend(f"{tmp_path}/duckdb"),
+        umls_key="123",
+    )
+    builder = umls_builder.UMLSBuilder()
+    builder.prepare_queries(
+        cursor=config.db.cursor(),
+        schema="main",
+        config=config,
+    )
+    expected = f"""CREATE TABLE IF NOT EXISTS umls__TESTTABLE AS SELECT
+    TTY,
+    CODE
+FROM read_parquet('{tmp_path / "generated_parquet/2000AA"}/TESTTABLE.parquet/*.parquet')"""
+    assert expected == builder.queries[0]
+
+
+@mock.patch.dict(
+    os.environ,
+    clear=True,
+)
+@mock.patch("pathlib.Path.resolve")
+def test_create_query_download_exists(mock_resolve, mock_responses, tmp_path):
+    mock_loc = tmp_path / "umls_builder.py"
+    mock_resolve.return_value = mock_loc
+
+    # Simulate artifacts left over from a previous, older release
+    prev_download_path = tmp_path / "downloads/1999AA/"
+    prev_download_path.mkdir(exist_ok=True, parents=True)
+    prev_parquet_path = tmp_path / "generated_parquet/1999AA/"
+    prev_parquet_path.mkdir(exist_ok=True, parents=True)
+
+    db_config.db_type = "duckdb"
+    config = base_utils.StudyConfig(
+        db=databases.DuckDatabaseBackend(f"{tmp_path}/duckdb"),
+        umls_key="123",
+    )
+    builder = umls_builder.UMLSBuilder()
+    builder.prepare_queries(
+        cursor=config.db.cursor(),
+        schema="main",
+        config=config,
+    )
+    # The old release should have been cleaned up, leaving only 2000AA
+    download_dirs = sorted((tmp_path / "downloads").iterdir())
+    assert len(download_dirs) == 1
+    assert "2000AA" in str(download_dirs[0])
+    parquet_dirs = sorted((tmp_path / "generated_parquet").iterdir())
+    assert len(parquet_dirs) == 1
+    assert "2000AA" in str(parquet_dirs[0])