diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml new file mode 100644 index 0000000..c9762e6 --- /dev/null +++ b/.github/workflows/ci.yaml @@ -0,0 +1,24 @@ +name: CI +on: [push] +jobs: + + lint: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + - name: Install linters + run: | + python -m pip install --upgrade pip + pip install ".[dev]" + - name: Run sqlfluff on jinja templates + run: | + sqlfluff lint + - name: Run ruff + if: success() || failure() # still run ruff even if the above check fails + run: | + ruff check + ruff format --check \ No newline at end of file diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml new file mode 100644 index 0000000..b491448 --- /dev/null +++ b/.github/workflows/pypi.yaml @@ -0,0 +1,25 @@ +name: PyPI + +on: + release: + types: [created] + +jobs: + publish: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v3 + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install build + + - name: Build + run: python -m build + + - name: Publish + uses: pypa/gh-action-pypi-publish@release/v1 + with: + password: ${{ secrets.PYPI_API_TOKEN }} + print_hash: true \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..17a8960 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +downloads/ +generated_parquet/ \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..ad3f57a --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,17 @@ +default_install_hook_types: [pre-commit, pre-push] +repos: + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.2.1 + hooks: + - name: Ruff formatting + id: ruff-format + - name: Ruff linting + id: ruff + stages: [pre-push] + + - repo: https://github.com/sqlfluff/sqlfluff + rev: 2.3.4 + hooks: + - id: sqlfluff-lint + types: 
[] + types_or: [sql,jinja] \ No newline at end of file diff --git a/README.MD b/README.MD index d5367dd..ad61a76 100644 --- a/README.MD +++ b/README.MD @@ -1 +1,33 @@ -# Cumulus Library UMLS \ No newline at end of file +# Cumulus Library UMLS + +An installation of the Unified Medical Language System® Metathesaurus®. Part of the [SMART on FHIR Cumulus Project](https://smarthealthit.org/cumulus-a-universal-sidecar-for-a-smart-learning-healthcare-system/) + +For more information, [browse the documentation](https://docs.smarthealthit.org/cumulus/library). +## Usage + +In order to use the Metathesaurus, you'll need to get an API key for access from the National Library of Medicine, which you can sign up for [here](https://uts.nlm.nih.gov/uts/signup-login). + +You can then install this module by running `pip install cumulus-library-umls`. + +This will add a `umls` target to `cumulus-library`. You'll need to pass your +API key via the `--umls` CLI flag, or set the `UMLS_API_KEY` environment variable +to the key you received from NIH. + +This ends up being a fairly intensive operation - we download a large file, +extract it, create parquet files from Athena, and then upload it. It usually +takes a half hour to run. We try to preserve some of those artifacts along +the way to make rebuilds faster. If you need to force recreation from scratch, the +`--replace-existing` CLI flag will handle this. + +## Licensing details + +The `cumulus-library-umls` study is provided as a convenience to install the +UMLS Metathesaurus, but is not shipped with the Metathesaurus dataset. It will +require an API key to download the data from NIH directly. + +As a reminder, the +[License Agreement for Use of the UMLS® Metathesaurus®](https://uts.nlm.nih.gov/uts/assets/LicenseAgreement.pdf) +provides several restrictions on this usage of this data (including distributing +the dataset). 
When you sign up for a UMLS key, you are assuming responsibility +for complying with these terms, or an alternate licensing agreement with the +owner of the Metathesaurus data if you are provided with one. diff --git a/cumulus-library-umls/umls/__pycache__/umls_builder.cpython-311.pyc b/cumulus-library-umls/umls/__pycache__/umls_builder.cpython-311.pyc new file mode 100644 index 0000000..ba07102 Binary files /dev/null and b/cumulus-library-umls/umls/__pycache__/umls_builder.cpython-311.pyc differ diff --git a/cumulus-library-umls/umls/manifest.toml b/cumulus-library-umls/umls/manifest.toml new file mode 100644 index 0000000..002a324 --- /dev/null +++ b/cumulus-library-umls/umls/manifest.toml @@ -0,0 +1,6 @@ +study_prefix = "umls" + +[table_builder_config] +file_names = [ + "umls_builder.py" +] diff --git a/cumulus-library-umls/umls/umls_builder.py b/cumulus-library-umls/umls/umls_builder.py new file mode 100644 index 0000000..ad6e986 --- /dev/null +++ b/cumulus-library-umls/umls/umls_builder.py @@ -0,0 +1,121 @@ + +import pathlib +import zipfile + +import pandas + +from cumulus_library.apis import umls +from cumulus_library.template_sql import base_templates + +from cumulus_library import base_table_builder, base_utils, databases, study_parser + + +class UMLSBuilder(base_table_builder.BaseTableBuilder): + + def get_umls_data(self, download_path, replace_existing): + api = umls.UmlsApi() + metadata = api.get_latest_umls_file_release(target="umls-metathesaurus-full-subset") + download_required = False + if not (download_path / metadata["releaseVersion"]).exists(): + download_required = True + if download_required or replace_existing: + api.download_umls_files(target="umls-metathesaurus-full-subset", path=download_path) + files = list(download_path.glob(f'./{metadata["releaseVersion"]}/META/*.ctl')) + filtered_files = [] + for file in files: + if not file.stem.startswith('MRX'): + filtered_files.append(file) + return filtered_files, metadata["releaseVersion"] + + def 
sql_type_to_df_parquet_type(self, text): + text = text.split('(')[0].strip(',').replace(' external','') + match text: + case 'char': + return 'string', 'String' + case 'integer': + return 'Int64','Integer' + case 'float': + return 'float','Float' + case _: + raise Exception(f"'{text}' missing a type converter") + + def parse_ctl_file(self, contents: list[str]): + datasource = None + table ={'headers': [], 'dtype':{}, 'parquet_types':[]} + is_col_def_section = False + for line in contents: + if line.startswith('infile'): + datasource = line.split(' ')[1].rstrip().replace("'","") + elif line.startswith('('): + is_col_def_section = True + line = line [1:] + elif line.startswith(')'): + is_col_def_section = False + if is_col_def_section: + line = line.strip().split('\t') + df_type, parquet_type = self.sql_type_to_df_parquet_type(line[1]) + table['headers'].append(line[0]) + table['dtype'][line[0]] = df_type + table['parquet_types'].append(parquet_type) + return datasource, table + + def create_parquet(self, rrf_path:pathlib.Path, parquet_path: pathlib.Path, table:dict[list],replace_existing = False): + if not replace_existing: + if (parquet_path / f"{rrf_path.stem}.parquet").exists(): + return + df = pandas.read_csv( + rrf_path, + delimiter="|", + names=table['headers'], + dtype=table['dtype'], + index_col = False, + ) + df.to_parquet(parquet_path / f"{rrf_path.stem}.parquet") + + def prepare_queries( + self, + cursor: databases.DatabaseCursor, + schema: str, + *args, + config = study_parser.StudyConfig, + **kwargs + ): + download_path = pathlib.Path(__file__).resolve().parent / 'downloads' + download_path.mkdir(exist_ok=True, parents=True) + files, version = self.get_umls_data(download_path, config.replace_existing) + parquet_path = pathlib.Path(__file__).resolve().parent / f'generated_parquet/{version}' + parquet_path.mkdir(exist_ok=True, parents=True) + + + with base_utils.get_progress_bar() as progress: + task = progress.add_task( + None, + total=len(files), + ) + 
for file in files: + with open(file) as f: + datasource, table = self.parse_ctl_file(f.readlines()) + progress.update(task,description=f'Processing {datasource}...') + rrf_path = download_path / f'./{version}/META/{datasource}' + self.create_parquet(rrf_path, parquet_path, table, replace_existing= config.replace_existing) + remote_path = config.db_backend.upload_file( + cursor=cursor, + file=parquet_path / f"{file.stem}.parquet", + study="umls", + topic=file.stem, + remote_filename=f"{file.stem}.parquet", + replace_existing = config.replace_existing + ) + self.queries.append( + base_templates.get_ctas_from_parquet_query( + schema_name=schema, + table_name=f"umls__{file.stem}", + # this local location needs to be modified to sometimes expect a file + # rather than a dir, or the generated output needs to reflect this better + local_location=parquet_path / f"{file.stem}.parquet", + remote_location=remote_path, + table_cols=table['headers'], + remote_table_cols_types=table['parquet_types'], + ) + ) + progress.advance(task) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..c8bbf7e --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,64 @@ +[project] +name = "cumulus-library-umls" +requires-python = ">= 3.10" +dependencies = [ + "cumulus-library >= 2.1", +] +description = "A Unified Medical Language System® Metathesaurus study for the Cumulus project" +readme = "README.MD" +license = { text="Apache License 2.0" } +keywords = ["FHIR", "SQL", "Health Informatics"] +classifiers = [ + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Topic :: Software Development :: Libraries :: Python Modules", +] +dynamic=["version"] +[project.optional-dependencies] +dev = [ + "ruff == 0.2.1", + "pre-commit", +] +test = [ + "pytest", +] + +[project.urls] +Home = "https://smarthealthit.org/cumulus-a-universal-sidecar-for-a-smart-learning-healthcare-system/" +Documentation = 
"https://docs.smarthealthit.org/cumulus/" +Source = "https://github.com/smart-on-fhir/cumulus-library-umls" + +[build-system] +requires = ["flit_core >=3.4,<4"] +build-backend = "flit_core.buildapi" + +[tool.flit.sdist] +include = [".sqlfluff"] + +[tool.pytest.ini_options] +minversion = "6.0" +testpaths = [ + "tests", +] + +[tool.ruff] +target-version = "py310" + +[tool.ruff.lint] +select = [ + "A", # prevent using keywords that clobber python builtins + "B", # bugbear: security warnings + "E", # pycodestyle + "F", # pyflakes + "I", # isort + "ISC", # implicit string concatenation + "PLE", # pylint errors + "RUF", # the ruff developer's own rules + "UP", # alert you when better syntax is available in your python version +] +ignore = [ +# Recommended ignore from `ruff format` due to in-project conflicts with check. +# It's expected that this will be fixed in the coming months. + "ISC001" +] \ No newline at end of file