Skip to content

Commit

Permalink
midstream
Browse files Browse the repository at this point in the history
  • Loading branch information
dogversioning committed May 6, 2024
1 parent 68c1899 commit 10bcd6a
Show file tree
Hide file tree
Showing 9 changed files with 292 additions and 1 deletion.
24 changes: 24 additions & 0 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
name: CI
on: [push]
jobs:

  lint:
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'
      - name: Install linters
        run: |
          python -m pip install --upgrade pip
          pip install ".[dev]"
      - name: Run sqlfluff on jinja templates
        run: |
          sqlfluff lint
      - name: Run ruff
        if: success() || failure()  # still run ruff even if the sqlfluff check fails
        run: |
          ruff check
          ruff format --check
25 changes: 25 additions & 0 deletions .github/workflows/pypi.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
name: PyPI

on:
  release:
    types: [created]

jobs:
  publish:
    runs-on: ubuntu-22.04
    steps:
      # v4 to match the checkout action version used by ci.yaml
      - uses: actions/checkout@v4

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install build
      - name: Build
        run: python -m build

      - name: Publish
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          password: ${{ secrets.PYPI_API_TOKEN }}
          print_hash: true
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
downloads/
generated_parquet/
17 changes: 17 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
default_install_hook_types: [pre-commit, pre-push]
repos:
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.2.1
    hooks:
      - name: Ruff formatting
        id: ruff-format
      - name: Ruff linting
        id: ruff
        # formatting runs on every commit; linting only at push time
        stages: [pre-push]

  - repo: https://github.com/sqlfluff/sqlfluff
    rev: 2.3.4
    hooks:
      - id: sqlfluff-lint
        # clear the hook's default type filter so that types_or below is the
        # only filter applied (lint both .sql files and jinja templates)
        types: []
        types_or: [sql,jinja]
34 changes: 33 additions & 1 deletion README.MD
Original file line number Diff line number Diff line change
@@ -1 +1,33 @@
# Cumulus Library UMLS
# Cumulus Library UMLS

An installation of the Unified Medical Language System® Metathesaurus®. Part of the [SMART on FHIR Cumulus Project](https://smarthealthit.org/cumulus-a-universal-sidecar-for-a-smart-learning-healthcare-system/)

For more information, [browse the documentation](https://docs.smarthealthit.org/cumulus/library).
## Usage

In order to use the Metathesaurus, you'll need to get an API key for access from the National Library of Medicine, which you can sign up for [here](https://uts.nlm.nih.gov/uts/signup-login).

You can then install this module by running `pip install cumulus-library-umls`.

This will add a `umls` target to `cumulus-library`. You'll need to pass your
API key via the `--umls` CLI flag, or set the `UMLS_API_KEY` environment variable
to the key you received from NIH.

This ends up being a fairly intensive operation - we download a large file,
extract it, create parquet files for Athena, and then upload them. It usually
takes a half hour to run. We try to preserve some of those artifacts along
the way to make rebuilds faster. If you need to force recreation from scratch, the
`--replace-existing` CLI flag will handle this.

## Licensing details

The `cumulus-library-umls` study is provided as a convenience to install the
UMLS Metathesaurus, but is not shipped with the Metathesaurus dataset. It will
require an API key to download the data from NIH directly.

As a reminder, the
[License Agreement for Use of the UMLS® Metathesaurus®](https://uts.nlm.nih.gov/uts/assets/LicenseAgreement.pdf)
places several restrictions on the usage of this data (including distributing
the dataset). When you sign up for a UMLS key, you are assuming responsibility
for complying with these terms, or an alternate licensing agreement with the
owner of the Metathesaurus data if you are provided with one.
Binary file not shown.
6 changes: 6 additions & 0 deletions cumulus-library-umls/umls/manifest.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Cumulus Library study manifest. The prefix is used to namespace all tables
# created by this study (e.g. umls__MRCONSO).
study_prefix = "umls"

[table_builder_config]
# Python table-builder scripts executed by cumulus-library for this study.
file_names = [
    "umls_builder.py"
]
121 changes: 121 additions & 0 deletions cumulus-library-umls/umls/umls_builder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@

import pathlib
import zipfile

import pandas

from cumulus_library.apis import umls
from cumulus_library.template_sql import base_templates

from cumulus_library import base_table_builder, base_utils, databases, study_parser


class UMLSBuilder(base_table_builder.BaseTableBuilder):

def get_umls_data(self, download_path, replace_existing):
api = umls.UmlsApi()
metadata = api.get_latest_umls_file_release(target="umls-metathesaurus-full-subset")
download_required = False
if not (download_path / metadata["releaseVersion"]).exists():
download_required = True
if download_required or replace_existing:
api.download_umls_files(target="umls-metathesaurus-full-subset", path=download_path)
files = list(download_path.glob(f'./{metadata["releaseVersion"]}/META/*.ctl'))
filtered_files = []
for file in files:
if not file.stem.startswith('MRX'):
filtered_files.append(file)
return filtered_files, metadata["releaseVersion"]

def sql_type_to_df_parquet_type(self, text):
text = text.split('(')[0].strip(',').replace(' external','')
match text:
case 'char':
return 'string', 'String'
case 'integer':
return 'Int64','Integer'
case 'float':
return 'float','Float'
case _:
raise Exception(f"'{text}' missing a type converter")

def parse_ctl_file(self, contents: list[str]):
datasource = None
table ={'headers': [], 'dtype':{}, 'parquet_types':[]}
is_col_def_section = False
for line in contents:
if line.startswith('infile'):
datasource = line.split(' ')[1].rstrip().replace("'","")
elif line.startswith('('):
is_col_def_section = True
line = line [1:]
elif line.startswith(')'):
is_col_def_section = False
if is_col_def_section:
line = line.strip().split('\t')
df_type, parquet_type = self.sql_type_to_df_parquet_type(line[1])
table['headers'].append(line[0])
table['dtype'][line[0]] = df_type
table['parquet_types'].append(parquet_type)
return datasource, table

def create_parquet(self, rrf_path:pathlib.Path, parquet_path: pathlib.Path, table:dict[list],replace_existing = False):
if not replace_existing:
if (parquet_path / f"{rrf_path.stem}.parquet").exists():
return
df = pandas.read_csv(
rrf_path,
delimiter="|",
names=table['headers'],
dtype=table['dtype'],
index_col = False,
)
df.to_parquet(parquet_path / f"{rrf_path.stem}.parquet")

def prepare_queries(
self,
cursor: databases.DatabaseCursor,
schema: str,
*args,
config = study_parser.StudyConfig,
**kwargs
):
download_path = pathlib.Path(__file__).resolve().parent / 'downloads'
download_path.mkdir(exist_ok=True, parents=True)
files, version = self.get_umls_data(download_path, config.replace_existing)
parquet_path = pathlib.Path(__file__).resolve().parent / f'generated_parquet/{version}'
parquet_path.mkdir(exist_ok=True, parents=True)


with base_utils.get_progress_bar() as progress:
task = progress.add_task(
None,
total=len(files),
)
for file in files:
with open(file) as f:
datasource, table = self.parse_ctl_file(f.readlines())
progress.update(task,description=f'Processing {datasource}...')
rrf_path = download_path / f'./{version}/META/{datasource}'
self.create_parquet(rrf_path, parquet_path, table, replace_existing= config.replace_existing)
remote_path = config.db_backend.upload_file(
cursor=cursor,
file=parquet_path / f"{file.stem}.parquet",
study="umls",
topic=file.stem,
remote_filename=f"{file.stem}.parquet",
replace_existing = config.replace_existing
)
self.queries.append(
base_templates.get_ctas_from_parquet_query(
schema_name=schema,
table_name=f"umls__{file.stem}",
# this local location needs to be modified to sometimes expect a file
# rather than a dir, or the generated output needs to reflect this better
local_location=parquet_path / f"{file.stem}.parquet",
remote_location=remote_path,
table_cols=table['headers'],
remote_table_cols_types=table['parquet_types'],
)
)
progress.advance(task)
64 changes: 64 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
[project]
name = "cumulus-library-umls"
requires-python = ">= 3.10"
dependencies = [
    "cumulus-library >= 2.1",
]
description = "A Unified Medical Language System® Metathesaurus study for the Cumulus project"
# Matches the actual filename (README.MD); a lowercase .md extension would
# break sdist/wheel builds on case-sensitive filesystems.
readme = "README.MD"
license = { text="Apache License 2.0" }
keywords = ["FHIR", "SQL", "Health Informatics"]
classifiers = [
    "License :: OSI Approved :: Apache Software License",
    "Operating System :: OS Independent",
    "Programming Language :: Python :: 3",
    "Topic :: Software Development :: Libraries :: Python Modules",
]
dynamic=["version"]

[project.optional-dependencies]
dev = [
    "ruff == 0.2.1",
    "pre-commit",
]
test = [
    "pytest",
]

[project.urls]
Home = "https://smarthealthit.org/cumulus-a-universal-sidecar-for-a-smart-learning-healthcare-system/"
Documentation = "https://docs.smarthealthit.org/cumulus/"
Source = "https://github.com/smart-on-fhir/cumulus-library-umls"

[build-system]
requires = ["flit_core >=3.4,<4"]
build-backend = "flit_core.buildapi"

[tool.flit.sdist]
include = [".sqlfluff"]

[tool.pytest.ini_options]
minversion = "6.0"
testpaths = [
    "tests",
]

[tool.ruff]
target-version = "py310"

[tool.ruff.lint]
select = [
    "A",  # prevent using keywords that clobber python builtins
    "B",  # bugbear: security warnings
    "E",  # pycodestyle
    "F",  # pyflakes
    "I",  # isort
    "ISC",  # implicit string concatenation
    "PLE",  # pylint errors
    "RUF",  # the ruff developer's own rules
    "UP",  # alert you when better syntax is available in your python version
]
ignore = [
    # Recommended ignore from `ruff format` due to in-project conflicts with check.
    # It's expected that this will be fixed in the coming months.
    "ISC001"
]

0 comments on commit 10bcd6a

Please sign in to comment.