-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
68c1899
commit 10bcd6a
Showing
9 changed files
with
292 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
name: CI | ||
on: [push] | ||
jobs: | ||
|
||
lint: | ||
runs-on: ubuntu-22.04 | ||
steps: | ||
- uses: actions/checkout@v4 | ||
- name: Set up Python | ||
uses: actions/setup-python@v5 | ||
with: | ||
python-version: '3.10' | ||
- name: Install linters | ||
run: | | ||
python -m pip install --upgrade pip | ||
pip install ".[dev]" | ||
- name: Run sqlfluff on jinja templates | ||
run: | | ||
sqlfluff lint | ||
- name: Run ruff | ||
if: success() || failure() # still run ruff even if the checks above fail | ||
run: | | ||
ruff check | ||
ruff format --check |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
name: PyPI | ||
|
||
on: | ||
release: | ||
types: [created] | ||
|
||
jobs: | ||
publish: | ||
runs-on: ubuntu-22.04 | ||
steps: | ||
- uses: actions/checkout@v3 | ||
|
||
- name: Install dependencies | ||
run: | | ||
python -m pip install --upgrade pip | ||
pip install build | ||
- name: Build | ||
run: python -m build | ||
|
||
- name: Publish | ||
uses: pypa/gh-action-pypi-publish@release/v1 | ||
with: | ||
password: ${{ secrets.PYPI_API_TOKEN }} | ||
print_hash: true |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
downloads/ | ||
generated_parquet/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
default_install_hook_types: [pre-commit, pre-push] | ||
repos: | ||
- repo: https://github.com/astral-sh/ruff-pre-commit | ||
rev: v0.2.1 | ||
hooks: | ||
- name: Ruff formatting | ||
id: ruff-format | ||
- name: Ruff linting | ||
id: ruff | ||
stages: [pre-push] | ||
|
||
- repo: https://github.com/sqlfluff/sqlfluff | ||
rev: 2.3.4 | ||
hooks: | ||
- id: sqlfluff-lint | ||
types: [] | ||
types_or: [sql,jinja] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,33 @@ | ||
# Cumulus Library UMLS | ||
# Cumulus Library UMLS | ||
|
||
An installation of the Unified Medical Language System® Metathesaurus®. Part of the [SMART on FHIR Cumulus Project](https://smarthealthit.org/cumulus-a-universal-sidecar-for-a-smart-learning-healthcare-system/) | ||
|
||
For more information, [browse the documentation](https://docs.smarthealthit.org/cumulus/library). | ||
## Usage | ||
|
||
In order to use the Metathesaurus, you'll need to get an API key for access from the National Library of Medicine, which you can sign up for [here](https://uts.nlm.nih.gov/uts/signup-login). | ||
|
||
You can then install this module by running `pip install cumulus-library-umls`. | ||
|
||
This will add a `umls` target to `cumulus-library`. You'll need to pass your | ||
API key via the `--umls` CLI flag, or set the `UMLS_API_KEY` environment variable | ||
to the key you received from NIH. | ||
|
||
This ends up being a fairly intensive operation - we download a large file, | ||
extract it, create parquet files for Athena, and then upload them. It usually | ||
takes about half an hour to run. We try to preserve some of those artifacts along | ||
the way to make rebuilds faster. If you need to force recreation from scratch, the | ||
`--replace-existing` CLI flag will handle this. | ||
|
||
## Licensing details | ||
|
||
The `cumulus-library-umls` study is provided as a convenience to install the | ||
UMLS Metathesaurus, but is not shipped with the Metathesaurus dataset. It will | ||
require an API key to download the data from NIH directly. | ||
|
||
As a reminder, the | ||
[License Agreement for Use of the UMLS® Metathesaurus®](https://uts.nlm.nih.gov/uts/assets/LicenseAgreement.pdf) | ||
places several restrictions on the usage of this data (including distributing | ||
the dataset). When you sign up for a UMLS key, you are assuming responsibility | ||
for complying with these terms, or an alternate licensing agreement with the | ||
owner of the Metathesaurus data if you are provided with one. |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
study_prefix = "umls" | ||
|
||
[table_builder_config] | ||
file_names = [ | ||
"umls_builder.py" | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,121 @@ | ||
|
||
import pathlib | ||
import zipfile | ||
|
||
import pandas | ||
|
||
from cumulus_library.apis import umls | ||
from cumulus_library.template_sql import base_templates | ||
|
||
from cumulus_library import base_table_builder, base_utils, databases, study_parser | ||
|
||
|
||
class UMLSBuilder(base_table_builder.BaseTableBuilder): | ||
|
||
def get_umls_data(self, download_path, replace_existing): | ||
api = umls.UmlsApi() | ||
metadata = api.get_latest_umls_file_release(target="umls-metathesaurus-full-subset") | ||
download_required = False | ||
if not (download_path / metadata["releaseVersion"]).exists(): | ||
download_required = True | ||
if download_required or replace_existing: | ||
api.download_umls_files(target="umls-metathesaurus-full-subset", path=download_path) | ||
files = list(download_path.glob(f'./{metadata["releaseVersion"]}/META/*.ctl')) | ||
filtered_files = [] | ||
for file in files: | ||
if not file.stem.startswith('MRX'): | ||
filtered_files.append(file) | ||
return filtered_files, metadata["releaseVersion"] | ||
|
||
def sql_type_to_df_parquet_type(self, text): | ||
text = text.split('(')[0].strip(',').replace(' external','') | ||
match text: | ||
case 'char': | ||
return 'string', 'String' | ||
case 'integer': | ||
return 'Int64','Integer' | ||
case 'float': | ||
return 'float','Float' | ||
case _: | ||
raise Exception(f"'{text}' missing a type converter") | ||
|
||
def parse_ctl_file(self, contents: list[str]): | ||
datasource = None | ||
table ={'headers': [], 'dtype':{}, 'parquet_types':[]} | ||
is_col_def_section = False | ||
for line in contents: | ||
if line.startswith('infile'): | ||
datasource = line.split(' ')[1].rstrip().replace("'","") | ||
elif line.startswith('('): | ||
is_col_def_section = True | ||
line = line [1:] | ||
elif line.startswith(')'): | ||
is_col_def_section = False | ||
if is_col_def_section: | ||
line = line.strip().split('\t') | ||
df_type, parquet_type = self.sql_type_to_df_parquet_type(line[1]) | ||
table['headers'].append(line[0]) | ||
table['dtype'][line[0]] = df_type | ||
table['parquet_types'].append(parquet_type) | ||
return datasource, table | ||
|
||
def create_parquet(self, rrf_path:pathlib.Path, parquet_path: pathlib.Path, table:dict[list],replace_existing = False): | ||
if not replace_existing: | ||
if (parquet_path / f"{rrf_path.stem}.parquet").exists(): | ||
return | ||
df = pandas.read_csv( | ||
rrf_path, | ||
delimiter="|", | ||
names=table['headers'], | ||
dtype=table['dtype'], | ||
index_col = False, | ||
) | ||
df.to_parquet(parquet_path / f"{rrf_path.stem}.parquet") | ||
|
||
def prepare_queries( | ||
self, | ||
cursor: databases.DatabaseCursor, | ||
schema: str, | ||
*args, | ||
config = study_parser.StudyConfig, | ||
**kwargs | ||
): | ||
download_path = pathlib.Path(__file__).resolve().parent / 'downloads' | ||
download_path.mkdir(exist_ok=True, parents=True) | ||
files, version = self.get_umls_data(download_path, config.replace_existing) | ||
parquet_path = pathlib.Path(__file__).resolve().parent / f'generated_parquet/{version}' | ||
parquet_path.mkdir(exist_ok=True, parents=True) | ||
|
||
|
||
with base_utils.get_progress_bar() as progress: | ||
task = progress.add_task( | ||
None, | ||
total=len(files), | ||
) | ||
for file in files: | ||
with open(file) as f: | ||
datasource, table = self.parse_ctl_file(f.readlines()) | ||
progress.update(task,description=f'Processing {datasource}...') | ||
rrf_path = download_path / f'./{version}/META/{datasource}' | ||
self.create_parquet(rrf_path, parquet_path, table, replace_existing= config.replace_existing) | ||
remote_path = config.db_backend.upload_file( | ||
cursor=cursor, | ||
file=parquet_path / f"{file.stem}.parquet", | ||
study="umls", | ||
topic=file.stem, | ||
remote_filename=f"{file.stem}.parquet", | ||
replace_existing = config.replace_existing | ||
) | ||
self.queries.append( | ||
base_templates.get_ctas_from_parquet_query( | ||
schema_name=schema, | ||
table_name=f"umls__{file.stem}", | ||
# this local location needs to be modified to sometimes expect a file | ||
# rather than a dir, or the generated output needs to reflect this better | ||
local_location=parquet_path / f"{file.stem}.parquet", | ||
remote_location=remote_path, | ||
table_cols=table['headers'], | ||
remote_table_cols_types=table['parquet_types'], | ||
) | ||
) | ||
progress.advance(task) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
[project] | ||
name = "cumulus-library-umls" | ||
requires-python = ">= 3.10" | ||
dependencies = [ | ||
"cumulus-library >= 2.1", | ||
] | ||
description = "A Unified Medical Language System® Metathesaurus study for the Cumulus project" | ||
readme = "README.md" | ||
license = { text="Apache License 2.0" } | ||
keywords = ["FHIR", "SQL", "Health Informatics"] | ||
classifiers = [ | ||
"License :: OSI Approved :: Apache Software License", | ||
"Operating System :: OS Independent", | ||
"Programming Language :: Python :: 3", | ||
"Topic :: Software Development :: Libraries :: Python Modules", | ||
] | ||
dynamic=["version"] | ||
[project.optional-dependencies] | ||
dev = [ | ||
"ruff == 0.2.1", | ||
"pre-commit", | ||
] | ||
test = [ | ||
"pytest", | ||
] | ||
|
||
[project.urls] | ||
Home = "https://smarthealthit.org/cumulus-a-universal-sidecar-for-a-smart-learning-healthcare-system/" | ||
Documentation = "https://docs.smarthealthit.org/cumulus/" | ||
Source = "https://github.com/smart-on-fhir/cumulus-library-umls" | ||
|
||
[build-system] | ||
requires = ["flit_core >=3.4,<4"] | ||
build-backend = "flit_core.buildapi" | ||
|
||
[tool.flit.sdist] | ||
include = [".sqlfluff"] | ||
|
||
[tool.pytest.ini_options] | ||
minversion = "6.0" | ||
testpaths = [ | ||
"tests", | ||
] | ||
|
||
[tool.ruff] | ||
target-version = "py310" | ||
|
||
[tool.ruff.lint] | ||
select = [ | ||
"A", # prevent using keywords that clobber python builtins | ||
"B", # bugbear: likely bugs and design problems | ||
"E", # pycodestyle | ||
"F", # pyflakes | ||
"I", # isort | ||
"ISC", # implicit string concatenation | ||
"PLE", # pylint errors | ||
"RUF", # the ruff developer's own rules | ||
"UP", # alert you when better syntax is available in your python version | ||
] | ||
ignore = [ | ||
# Recommended ignore from `ruff format` due to in-project conflicts with check. | ||
# It's expected that this will be fixed in the coming months. | ||
"ISC001" | ||
] |