Skip to content

Commit

Permalink
midstream
Browse files Browse the repository at this point in the history
  • Loading branch information
dogversioning committed May 6, 2024
1 parent 68c1899 commit 10bcd6a
Show file tree
Hide file tree
Showing 9 changed files with 292 additions and 1 deletion.
24 changes: 24 additions & 0 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
name: CI
on: [push]
jobs:

  lint:
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'
      - name: Install linters
        run: |
          python -m pip install --upgrade pip
          pip install ".[dev]"
      - name: Run sqlfluff on jinja templates
        run: |
          sqlfluff lint
      - name: Run ruff
        if: success() || failure()  # still run ruff even if the sqlfluff check fails
        run: |
          ruff check
          ruff format --check
25 changes: 25 additions & 0 deletions .github/workflows/pypi.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
name: PyPI

on:
  release:
    types: [created]

jobs:
  publish:
    runs-on: ubuntu-22.04
    steps:
      # v4 to match the checkout action version used by ci.yaml
      - uses: actions/checkout@v4

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install build
      - name: Build
        run: python -m build

      - name: Publish
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          password: ${{ secrets.PYPI_API_TOKEN }}
          print_hash: true
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
downloads/
generated_parquet/
17 changes: 17 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
default_install_hook_types: [pre-commit, pre-push]
repos:
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.2.1
    hooks:
      - name: Ruff formatting
        id: ruff-format
      - name: Ruff linting
        id: ruff
        # formatting runs on every commit; linting only at push time
        stages: [pre-push]

  - repo: https://github.com/sqlfluff/sqlfluff
    rev: 2.3.4
    hooks:
      - id: sqlfluff-lint
        # clear the hook's default type filter so that types_or below is the
        # only filter applied (lint both .sql files and jinja templates)
        types: []
        types_or: [sql,jinja]
34 changes: 33 additions & 1 deletion README.MD
Original file line number Diff line number Diff line change
@@ -1 +1,33 @@
# Cumulus Library UMLS
# Cumulus Library UMLS

An installation of the Unified Medical Language System® Metathesaurus®. Part of the [SMART on FHIR Cumulus Project](https://smarthealthit.org/cumulus-a-universal-sidecar-for-a-smart-learning-healthcare-system/)

For more information, [browse the documentation](https://docs.smarthealthit.org/cumulus/library).
## Usage

In order to use the Metathesaurus, you'll need to get an API key for access from the National Library of Medicine, which you can sign up for [here](https://uts.nlm.nih.gov/uts/signup-login).

You can then install this module by running `pip install cumulus-library-umls`.

This will add a `umls` target to `cumulus-library`. You'll need to pass your
API key via the `--umls` CLI flag, or set the `UMLS_API_KEY` environment variable
to the key you received from NIH.

This ends up being a fairly intensive operation - we download a large file,
extract it, create parquet files for Athena, and then upload them. It usually
takes a half hour to run. We try to preserve some of those artifacts along
the way to make rebuilds faster. If you need to force recreation from scratch, the
`--replace-existing` CLI flag will handle this.

## Licensing details

The `cumulus-library-umls` study is provided as a convenience to install the
UMLS Metathesaurus, but is not shipped with the Metathesaurus dataset. It will
require an API key to download the data from NIH directly.

As a reminder, the
[License Agreement for Use of the UMLS® Metathesaurus®](https://uts.nlm.nih.gov/uts/assets/LicenseAgreement.pdf)
places several restrictions on the usage of this data (including distributing
the dataset). When you sign up for a UMLS key, you are assuming responsibility
for complying with these terms, or an alternate licensing agreement with the
owner of the Metathesaurus data if you are provided with one.
Binary file not shown.
6 changes: 6 additions & 0 deletions cumulus-library-umls/umls/manifest.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Cumulus Library study manifest. The prefix is used to namespace all tables
# created by this study (e.g. umls__MRCONSO).
study_prefix = "umls"

[table_builder_config]
# Python table-builder scripts executed by cumulus-library for this study.
file_names = [
    "umls_builder.py"
]
121 changes: 121 additions & 0 deletions cumulus-library-umls/umls/umls_builder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@

import pathlib
import zipfile

import pandas

from cumulus_library.apis import umls
from cumulus_library.template_sql import base_templates

from cumulus_library import base_table_builder, base_utils, databases, study_parser


class UMLSBuilder(base_table_builder.BaseTableBuilder):

def get_umls_data(self, download_path, replace_existing):
api = umls.UmlsApi()
metadata = api.get_latest_umls_file_release(target="umls-metathesaurus-full-subset")
download_required = False
if not (download_path / metadata["releaseVersion"]).exists():
download_required = True
if download_required or replace_existing:
api.download_umls_files(target="umls-metathesaurus-full-subset", path=download_path)
files = list(download_path.glob(f'./{metadata["releaseVersion"]}/META/*.ctl'))
filtered_files = []
for file in files:
if not file.stem.startswith('MRX'):
filtered_files.append(file)
return filtered_files, metadata["releaseVersion"]

def sql_type_to_df_parquet_type(self, text):
text = text.split('(')[0].strip(',').replace(' external','')
match text:
case 'char':
return 'string', 'String'
case 'integer':
return 'Int64','Integer'
case 'float':
return 'float','Float'
case _:
raise Exception(f"'{text}' missing a type converter")

def parse_ctl_file(self, contents: list[str]):
datasource = None
table ={'headers': [], 'dtype':{}, 'parquet_types':[]}
is_col_def_section = False
for line in contents:
if line.startswith('infile'):
datasource = line.split(' ')[1].rstrip().replace("'","")
elif line.startswith('('):
is_col_def_section = True
line = line [1:]
elif line.startswith(')'):
is_col_def_section = False
if is_col_def_section:
line = line.strip().split('\t')
df_type, parquet_type = self.sql_type_to_df_parquet_type(line[1])
table['headers'].append(line[0])
table['dtype'][line[0]] = df_type
table['parquet_types'].append(parquet_type)
return datasource, table

def create_parquet(self, rrf_path:pathlib.Path, parquet_path: pathlib.Path, table:dict[list],replace_existing = False):
if not replace_existing:
if (parquet_path / f"{rrf_path.stem}.parquet").exists():
return
df = pandas.read_csv(
rrf_path,
delimiter="|",
names=table['headers'],
dtype=table['dtype'],
index_col = False,
)
df.to_parquet(parquet_path / f"{rrf_path.stem}.parquet")

def prepare_queries(
self,
cursor: databases.DatabaseCursor,
schema: str,
*args,
config = study_parser.StudyConfig,
**kwargs
):
download_path = pathlib.Path(__file__).resolve().parent / 'downloads'
download_path.mkdir(exist_ok=True, parents=True)
files, version = self.get_umls_data(download_path, config.replace_existing)
parquet_path = pathlib.Path(__file__).resolve().parent / f'generated_parquet/{version}'
parquet_path.mkdir(exist_ok=True, parents=True)


with base_utils.get_progress_bar() as progress:
task = progress.add_task(
None,
total=len(files),
)
for file in files:
with open(file) as f:
datasource, table = self.parse_ctl_file(f.readlines())
progress.update(task,description=f'Processing {datasource}...')
rrf_path = download_path / f'./{version}/META/{datasource}'
self.create_parquet(rrf_path, parquet_path, table, replace_existing= config.replace_existing)
remote_path = config.db_backend.upload_file(
cursor=cursor,
file=parquet_path / f"{file.stem}.parquet",
study="umls",
topic=file.stem,
remote_filename=f"{file.stem}.parquet",
replace_existing = config.replace_existing
)
self.queries.append(
base_templates.get_ctas_from_parquet_query(
schema_name=schema,
table_name=f"umls__{file.stem}",
# this local location needs to be modified to sometimes expect a file
# rather than a dir, or the generated output needs to reflect this better
local_location=parquet_path / f"{file.stem}.parquet",
remote_location=remote_path,
table_cols=table['headers'],
remote_table_cols_types=table['parquet_types'],
)
)
progress.advance(task)
64 changes: 64 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
[project]
name = "cumulus-library-umls"
requires-python = ">= 3.10"
dependencies = [
    "cumulus-library >= 2.1",
]
description = "A Unified Medical Language System® Metathesaurus study for the Cumulus project"
# Matches the actual filename (README.MD); a lowercase .md extension would
# break sdist/wheel builds on case-sensitive filesystems.
readme = "README.MD"
license = { text="Apache License 2.0" }
keywords = ["FHIR", "SQL", "Health Informatics"]
classifiers = [
    "License :: OSI Approved :: Apache Software License",
    "Operating System :: OS Independent",
    "Programming Language :: Python :: 3",
    "Topic :: Software Development :: Libraries :: Python Modules",
]
dynamic=["version"]

[project.optional-dependencies]
dev = [
    "ruff == 0.2.1",
    "pre-commit",
]
test = [
    "pytest",
]

[project.urls]
Home = "https://smarthealthit.org/cumulus-a-universal-sidecar-for-a-smart-learning-healthcare-system/"
Documentation = "https://docs.smarthealthit.org/cumulus/"
Source = "https://github.com/smart-on-fhir/cumulus-library-umls"

[build-system]
requires = ["flit_core >=3.4,<4"]
build-backend = "flit_core.buildapi"

[tool.flit.sdist]
include = [".sqlfluff"]

[tool.pytest.ini_options]
minversion = "6.0"
testpaths = [
    "tests",
]

[tool.ruff]
target-version = "py310"

[tool.ruff.lint]
select = [
    "A",  # prevent using keywords that clobber python builtins
    "B",  # bugbear: security warnings
    "E",  # pycodestyle
    "F",  # pyflakes
    "I",  # isort
    "ISC",  # implicit string concatenation
    "PLE",  # pylint errors
    "RUF",  # the ruff developer's own rules
    "UP",  # alert you when better syntax is available in your python version
]
ignore = [
    # Recommended ignore from `ruff format` due to in-project conflicts with check.
    # It's expected that this will be fixed in the coming months.
    "ISC001"
]

0 comments on commit 10bcd6a

Please sign in to comment.