From 8d36baaecaad2fe0da4c49c9bbf852ff9f32fd53 Mon Sep 17 00:00:00 2001 From: Matt Garber Date: Wed, 8 May 2024 12:15:58 -0400 Subject: [PATCH 1/3] UMLS table creation --- .github/workflows/ci.yaml | 40 +++++ .github/workflows/pypi.yaml | 25 +++ .gitignore | 137 +++++++++++++++ .pre-commit-config.yaml | 17 ++ README.MD | 39 ++++- cumulus_library_umls/__init__.py | 0 cumulus_library_umls/umls/manifest.toml | 6 + cumulus_library_umls/umls/umls_builder.py | 195 ++++++++++++++++++++++ pyproject.toml | 65 ++++++++ tests/__init__.py | 0 tests/test_data/2000AA.zip | Bin 0 -> 880 bytes tests/test_data/2000AA/META/TESTTABLE.RRF | 3 + tests/test_data/2000AA/META/TESTTABLE.ctl | 13 ++ tests/test_data/README.MD | 5 + tests/test_umls_builder.py | 100 +++++++++++ 15 files changed, 644 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/ci.yaml create mode 100644 .github/workflows/pypi.yaml create mode 100644 .gitignore create mode 100644 .pre-commit-config.yaml create mode 100644 cumulus_library_umls/__init__.py create mode 100644 cumulus_library_umls/umls/manifest.toml create mode 100644 cumulus_library_umls/umls/umls_builder.py create mode 100644 pyproject.toml create mode 100644 tests/__init__.py create mode 100644 tests/test_data/2000AA.zip create mode 100644 tests/test_data/2000AA/META/TESTTABLE.RRF create mode 100644 tests/test_data/2000AA/META/TESTTABLE.ctl create mode 100644 tests/test_data/README.MD create mode 100644 tests/test_umls_builder.py diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml new file mode 100644 index 0000000..7dc9305 --- /dev/null +++ b/.github/workflows/ci.yaml @@ -0,0 +1,40 @@ +name: CI +on: [push] +jobs: + + lint: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + - name: Install linters + run: | + python -m pip install --upgrade pip + pip install ruff==0.2.1 + - name: Run ruff + if: success() || failure() # still 
run black if above checks fails + run: | + ruff check + ruff format --check + + unittest: + name: unit tests + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install ".[test]" + - name: Test with pytest + run: | + python -m pytest diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml new file mode 100644 index 0000000..39c5460 --- /dev/null +++ b/.github/workflows/pypi.yaml @@ -0,0 +1,25 @@ +name: PyPI + +on: + release: + types: [created] + +jobs: + publish: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v3 + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install build + + - name: Build + run: python -m build + + - name: Publish + uses: pypa/gh-action-pypi-publish@release/v1 + with: + password: ${{ secrets.PYPI_API_TOKEN }} + print_hash: true diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e8c465f --- /dev/null +++ b/.gitignore @@ -0,0 +1,137 @@ +downloads/ +generated_parquet/ + +# project specific +downloads/ +generated_parquet/ +output.sql + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..fe9656e --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,17 @@ +default_install_hook_types: [pre-commit, pre-push] +repos: + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.2.1 + hooks: + - name: Ruff formatting + id: ruff-format + - name: Ruff linting + id: ruff + stages: [pre-push] + + - repo: https://github.com/sqlfluff/sqlfluff + rev: 2.3.4 + hooks: + - id: sqlfluff-lint + types: [] + types_or: [sql,jinja] diff --git a/README.MD b/README.MD index 
d5367dd..fe15503 100644 --- a/README.MD +++ b/README.MD @@ -1 +1,38 @@ -# Cumulus Library UMLS \ No newline at end of file +# Cumulus Library UMLS + +An installation of the Unified Medical Language System® Metathesaurus®. Part of the [SMART on FHIR Cumulus Project](https://smarthealthit.org/cumulus-a-universal-sidecar-for-a-smart-learning-healthcare-system/) + +For more information, [browse the documentation](https://docs.smarthealthit.org/cumulus/library). +## Usage + +In order to use the Metathesaurus, you'll need to get an API key for access from the National Library of Medicine, which you can sign up for [here](https://uts.nlm.nih.gov/uts/signup-login). + +You can then install this module by running `pip install cumulus-library-umls`. + +This will add a `umls` target to `cumulus-library`. You'll need to pass your +API key via the `--umls-key` CLI flag, or set the `UMLS_API_KEY` environment variable +to the key you received from NIH. + +This ends up being a fairly intensive operation - we download a large file, +extract it, create parquet files from Athena, and then upload it. It usually +takes a half hour to run. We try to preserve some of those artifacts along +the way to make rebuilds faster. If you need to force recreation from scratch, the +`--replace-existing` CLI flag will handle this. + +## Licensing details + +The `cumulus-library-umls` study is provided as a convenience to install the +UMLS Metathesaurus, but is not shipped with the Metathesaurus dataset. It will +require an API key to download the data from NIH directly. + +As a reminder, the +[License Agreement for Use of the UMLS® Metathesaurus®](https://uts.nlm.nih.gov/uts/assets/LicenseAgreement.pdf) +provides several restrictions on the usage of this data (including distributing +the dataset). When you sign up for a UMLS key, you are assuming responsibility +for complying with these terms, or an alternate licensing agreement with the +owner of the Metathesaurus data if you are provided with one.
+ + +## Citations + +Bodenreider O. The Unified Medical Language System (UMLS): integrating biomedical terminology. Nucleic Acids Res. 2004 Jan 1;32(Database issue):D267-70. doi: 10.1093/nar/gkh061. PubMed PMID: 14681409; PubMed Central PMCID: PMC308795. \ No newline at end of file diff --git a/cumulus_library_umls/__init__.py b/cumulus_library_umls/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/cumulus_library_umls/umls/manifest.toml b/cumulus_library_umls/umls/manifest.toml new file mode 100644 index 0000000..002a324 --- /dev/null +++ b/cumulus_library_umls/umls/manifest.toml @@ -0,0 +1,6 @@ +study_prefix = "umls" + +[table_builder_config] +file_names = [ + "umls_builder.py" +] diff --git a/cumulus_library_umls/umls/umls_builder.py b/cumulus_library_umls/umls/umls_builder.py new file mode 100644 index 0000000..fb32108 --- /dev/null +++ b/cumulus_library_umls/umls/umls_builder.py @@ -0,0 +1,195 @@ +import pathlib + +import pandas +from cumulus_library import base_table_builder, base_utils, databases +from cumulus_library.apis import umls +from cumulus_library.template_sql import base_templates + + +class UMLSBuilder(base_table_builder.BaseTableBuilder): + def rmtree(self, root: pathlib.Path): + """Deletes a dir and all files underneath + + :param root: the location at the base of the path you want to remove + + TODO: replace with native pathlib.rmtree when upgrading to python 3.12 + """ + + # just in case, if we get passed a file (like if there's an error and a + # zip file exists in the download dir) + if not root.is_dir(): + root.unlink() + else: + for p in root.iterdir(): + if p.is_dir(): + self.rmtree(p) + else: + p.unlink() + root.rmdir() + + def get_umls_data( + self, + download_path: pathlib.Path, + parquet_path: pathlib.Path, + force_upload: bool, + umls_key: str, + ) -> (list, bool, str): + """Fetches and extracts data from the UMLS API + + :param download_path: the location to read from + :param parquet_path: the location output is 
written; only used for deletion + if a new dataset is downloaded + :param force_upload: if True, will download from UMLS regardless of data on disk + :param umls_key: the UMLS API key to use to auth requests + :returns: + - filtered_files - a list of files to process (excluding language tables) + - download_required - if True, a new UMLS release needed to be retrieved + - release_version - the name of the folder data was extracted to + """ + api = umls.UmlsApi(api_key=umls_key) + metadata = api.get_latest_umls_file_release( + target="umls-metathesaurus-full-subset" + ) + download_required = False + if not (download_path / metadata["releaseVersion"]).exists(): + print("New UMLS release available, downloading & updating...") + download_required = True + for version in download_path.iterdir(): + self.rmtree(version) + for version in (parquet_path).iterdir(): + self.rmtree(version) + if download_required or force_upload: + api.download_umls_files( + target="umls-metathesaurus-full-subset", path=download_path + ) + files = list(download_path.glob(f'./{metadata["releaseVersion"]}/META/*.ctl')) + filtered_files = [] + for file in files: + if not file.stem.startswith("MRX"): + filtered_files.append(file) + return filtered_files, download_required, metadata["releaseVersion"] + + def sql_type_to_df_parquet_type(self, text: str) -> str: + """Converts types extract from the MySQL .ctl definition to parquet types + + :param text: the type to convert + :returns: the parquet type + """ + text = text.split("(")[0].strip(",").replace(" external", "") + match text: + case "char": + return "string", "String" + case "integer": + return "Int64", "Integer" + case "float": + return "float", "Float" + case _: + raise Exception(f"'{text}' missing a type converter") + + def parse_ctl_file(self, contents: list[str]) -> (str, dict): + """Extracts table and type definitions from a *.ctl file + + :param contents: an array of strings, expected from a file.readlines call() + :returns: + - 
datasource - the name of the datasource for population + - table -a dict describing the table + """ + datasource = None + table = {"headers": [], "dtype": {}, "parquet_types": []} + is_col_def_section = False + for line in contents: + if line is None: + continue + if line.startswith("infile"): + datasource = line.split(" ")[1].rstrip().replace("'", "") + elif line.startswith("("): + is_col_def_section = True + line = line[1:] + elif line.startswith(")"): + is_col_def_section = False + if is_col_def_section: + if line is not None: + line = line.strip().split("\t") + df_type, parquet_type = self.sql_type_to_df_parquet_type(line[1]) + table["headers"].append(line[0]) + table["dtype"][line[0]] = df_type + table["parquet_types"].append(parquet_type) + return datasource, table + + def create_parquet( + self, + rrf_path: pathlib.Path, + parquet_path: pathlib.Path, + table: dict[list], + force_upload=False, + ): + """Creates a parquet file from a .rrf metathesaurus file + + :param rrf_path: the location of the .rrf files + :param parquet_path: the location to write output parquet to + :param table: a table definition created by parse_ctl_files + :param force_upload: if true, upload to a remote source regardless of what + already exists there + """ + if not force_upload: + if (parquet_path / f"{rrf_path.stem}.parquet").exists(): + return + df = pandas.read_csv( + rrf_path, + delimiter="|", + names=table["headers"], + dtype=table["dtype"], + index_col=False, + ) + df.to_parquet(parquet_path / f"{rrf_path.stem}.parquet") + + def prepare_queries( + self, + cursor: databases.DatabaseCursor, + schema: str, + *args, + config=base_utils.StudyConfig, + **kwargs, + ): + download_path = pathlib.Path(__file__).resolve().parent / "downloads" + download_path.mkdir(exist_ok=True, parents=True) + parquet_path = pathlib.Path(__file__).resolve().parent / "generated_parquet" + parquet_path.mkdir(exist_ok=True, parents=True) + files, new_version, folder = self.get_umls_data( + download_path, 
parquet_path, config.force_upload, config.umls_key + ) + parquet_path = parquet_path / folder + parquet_path.mkdir(exist_ok=True, parents=True) + + with base_utils.get_progress_bar() as progress: + task = progress.add_task( + None, + total=len(files), + ) + for file in files: + with open(file) as f: + datasource, table = self.parse_ctl_file(f.readlines()) + progress.update(task, description=f"Compressing {datasource}...") + rrf_path = download_path / f"./{folder}/META/{datasource}" + self.create_parquet( + rrf_path, parquet_path, table, force_upload=config.force_upload + ) + progress.update(task, description=f"Uploading {datasource}...") + remote_path = config.db.upload_file( + file=parquet_path / f"{file.stem}.parquet", + study="umls", + topic=file.stem, + remote_filename=f"{file.stem}.parquet", + force_upload=config.force_upload or new_version, + ) + self.queries.append( + base_templates.get_ctas_from_parquet_query( + schema_name=schema, + table_name=f"umls__{file.stem}", + local_location=parquet_path / f"{file.stem}.parquet", + remote_location=remote_path, + table_cols=table["headers"], + remote_table_cols_types=table["parquet_types"], + ) + ) + progress.advance(task) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..543de87 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,65 @@ +[project] +name = "cumulus-library-umls" +requires-python = ">= 3.10" +dependencies = [ + "cumulus-library >= 2.1", +] +description = "A Unified Medical Language System® Metathesaurus study for the Cumulus project" +readme = "README.md" +license = { text="Apache License 2.0" } +keywords = ["FHIR", "SQL", "Health Informatics"] +classifiers = [ + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Topic :: Software Development :: Libraries :: Python Modules", +] +dynamic=["version"] +[project.optional-dependencies] +dev = [ + "ruff == 0.2.1", + "pre-commit", +] +test = [ + 
"pytest", + "responses" +] + +[project.urls] +Home = "https://smarthealthit.org/cumulus-a-universal-sidecar-for-a-smart-learning-healthcare-system/" +Documentation = "https://docs.smarthealthit.org/cumulus/" +Source = "https://github.com/smart-on-fhir/cumulus-library-umls" + +[build-system] +requires = ["flit_core >=3.4,<4"] +build-backend = "flit_core.buildapi" + +[tool.flit.sdist] +include = [".sqlfluff"] + +[tool.pytest.ini_options] +minversion = "6.0" +testpaths = [ + "tests", +] + +[tool.ruff] +target-version = "py310" + +[tool.ruff.lint] +select = [ + "A", # prevent using keywords that clobber python builtins + "B", # bugbear: security warnings + "E", # pycodestyle + "F", # pyflakes + "I", # isort + "ISC", # implicit string concatenation + "PLE", # pylint errors + "RUF", # the ruff developer's own rules + "UP", # alert you when better syntax is available in your python version +] +ignore = [ +# Recommended ingore from `ruff format` due to in-project conflicts with check. +# It's expected that this will be fixed in the coming months. 
+ "ISC001" +] diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_data/2000AA.zip b/tests/test_data/2000AA.zip new file mode 100644 index 0000000000000000000000000000000000000000..898844cb6a99aa5c7c2d34cd49ae3f56db9304ab GIT binary patch literal 880 zcmWIWW@h1H0D;Ga%Ok)HD8bGk!(e1!VBqMe9~#2Rz`QcwD(w>xmsW5yFtU7QWME(s z0V)lE8<1LrY5)(y0AJS-s44RbtkOUjizy(}I2fM9t%#_f{L*F%koOaaCDBX?aSaX$ zadh%=)k`kP0Xr;ufpuCe5ThBH+Gm)3*g&B5`Mvc!e9fjz$ceMs(re4g)Mr&<<30U{ z!?dOnIj+ASCu){t9IyVq_xSX8TAE@xds51NEvkGf=A_g=Z_0tQQ%-6=Khz~(_QIfG z&&!@x@h+`(584`ztY&db{@uNGeg5(~^SIheJ@W6hM~FS*Zt4^!q8%9pq<5W~ePm2E%YZ1=Mb`STzCt$9;_Ze8m) zMo?fvf&mnmrx|OP$N+;t4R0_61-XHP0TikWFhiD`v(scv;W>ZLtEOunasfF_1w993 zk~xm=U<1Y-Ba<96uIQHlS_=XK3~wDlG(0h|LJ|X7cq1EznG}!>1DOUj44N>2CSpw# zAm`yq77!DG$&g`5V>_COXo&=9JSdT1bug~rMmD|=XbZA~5ypc9AD8P{*?`VsU;)Bd LpaFkDPGtZ9)pg!@ literal 0 HcmV?d00001 diff --git a/tests/test_data/2000AA/META/TESTTABLE.RRF b/tests/test_data/2000AA/META/TESTTABLE.RRF new file mode 100644 index 0000000..fbbe655 --- /dev/null +++ b/tests/test_data/2000AA/META/TESTTABLE.RRF @@ -0,0 +1,3 @@ +TTY1|Code-1| +TTY2|Code-2| +TTY3|Code-3| \ No newline at end of file diff --git a/tests/test_data/2000AA/META/TESTTABLE.ctl b/tests/test_data/2000AA/META/TESTTABLE.ctl new file mode 100644 index 0000000..fed3d33 --- /dev/null +++ b/tests/test_data/2000AA/META/TESTTABLE.ctl @@ -0,0 +1,13 @@ +options (direct=true) +load data +characterset UTF8 length semantics char +infile 'TESTTABLE.RRF' +badfile 'TESTTABLE.bad' +discardfile 'TESTTABLE.dsc' +truncate +into table TESTTABLE +fields terminated by '|' +trailing nullcols +(TTY char(10), +CODE char(8) +) \ No newline at end of file diff --git a/tests/test_data/README.MD b/tests/test_data/README.MD new file mode 100644 index 0000000..bfc6a74 --- /dev/null +++ b/tests/test_data/README.MD @@ -0,0 +1,5 @@ +# Test data details + +2000AA.zip is generated from zipping the associated 2000AA folder. 
This matches +the download format, as well as the .ctl/.rrf formats produced by the UMLS +Metathesaurus exports. \ No newline at end of file diff --git a/tests/test_umls_builder.py b/tests/test_umls_builder.py new file mode 100644 index 0000000..2ada8db --- /dev/null +++ b/tests/test_umls_builder.py @@ -0,0 +1,100 @@ +import os +from unittest import mock + +import pytest +import responses +from cumulus_library import base_utils, databases, db_config + +from cumulus_library_umls.umls import umls_builder + +AUTH_URL = "https://utslogin.nlm.nih.gov/validateUser" +RELEASE_URL = "https://uts-ws.nlm.nih.gov/releases" +DOWNLOAD_URL = "https://uts-ws.nlm.nih.gov/download" + + +@pytest.fixture +def mock_responses(): + with responses.RequestsMock(assert_all_requests_are_fired=False) as response: + with open("./tests/test_data/2000AA.zip", "rb") as download_zip: + response.add( + responses.GET, + AUTH_URL, + body="true", + status=200, + content_type="application/json", + ) + response.add( + responses.GET, + RELEASE_URL, + body="""[{ + "fileName": "2000AA.zip", + "releaseVersion": "2000AA", + "releaseDate": "2000-01-01", + "downloadUrl": "https://download.nlm.nih.gov/umls/kss/2000AA/2000AA.zip", + "releaseType": "UMLS Metathesaurus Level 0 Subset", + "product": "UMLS", + "current": true + }]""", + status=200, + content_type="application/json", + ) + response.add( + responses.GET, + DOWNLOAD_URL, + body=download_zip.read(), + status=200, + content_type="application/zip", + ) + yield response + + +@mock.patch.dict( + os.environ, + clear=True, +) +@mock.patch("pathlib.Path.resolve") +def test_create_query(mock_resolve, mock_responses, tmp_path): + mock_loc = tmp_path / "umls_builder.py" + mock_resolve.return_value = mock_loc + + db_config.db_type = "duckdb" + config = base_utils.StudyConfig( + db=databases.DuckDatabaseBackend(f"{tmp_path}/duckdb"), umls_key="123" + ) + builder = umls_builder.UMLSBuilder() + builder.prepare_queries(cursor=config.db.cursor(), schema="main",
config=config) + expected = f"""CREATE TABLE IF NOT EXISTS umls__TESTTABLE AS SELECT + TTY, + CODE +FROM read_parquet('{ + tmp_path / "generated_parquet/2000AA" + }/TESTTABLE.parquet/*.parquet')""" + assert expected == builder.queries[0] + + +@mock.patch.dict( + os.environ, + clear=True, +) +@mock.patch("pathlib.Path.resolve") +def test_create_query_download_exists(mock_resolve, mock_responses, tmp_path): + mock_loc = tmp_path / "umls_builder.py" + mock_resolve.return_value = mock_loc + + prev_download_path = tmp_path / "downloads/1999AA/" + prev_download_path.mkdir(exist_ok=True, parents=True) + prev_parquet_path = tmp_path / "generated_parquet/1999AA/" + prev_parquet_path.mkdir(exist_ok=True, parents=True) + + db_config.db_type = "duckdb" + config = base_utils.StudyConfig( + db=databases.DuckDatabaseBackend(f"{tmp_path}/duckdb"), umls_key="123" + ) + builder = umls_builder.UMLSBuilder() + builder.prepare_queries(cursor=config.db.cursor(), schema="main", config=config) + download_dirs = sorted((tmp_path / "downloads").iterdir()) + assert len(download_dirs) == 1 + assert "2000AA" in str(download_dirs[0]) + parquet_dirs = sorted((tmp_path / "generated_parquet").iterdir()) + assert len(parquet_dirs) == 1 + assert "2000AA" in str(parquet_dirs[0]) From 1729cd178a5bf7aaad6a5193dc7370abe1e6414b Mon Sep 17 00:00:00 2001 From: Matt Garber Date: Wed, 8 May 2024 13:23:02 -0400 Subject: [PATCH 2/3] Docs updates, some configs --- .github/workflows/ci.yaml | 4 ++-- .github/workflows/pypi.yaml | 2 +- README.MD | 4 ++-- cumulus_library_umls/umls/umls_builder.py | 2 +- pyproject.toml | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 7dc9305..5bc1d81 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -3,7 +3,7 @@ on: [push] jobs: lint: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Set up Python @@ -22,7 +22,7 @@ jobs: unittest: name: 
unit tests - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml index 39c5460..18c6c53 100644 --- a/.github/workflows/pypi.yaml +++ b/.github/workflows/pypi.yaml @@ -8,7 +8,7 @@ jobs: publish: runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Install dependencies run: | diff --git a/README.MD b/README.MD index fe15503..eab5159 100644 --- a/README.MD +++ b/README.MD @@ -1,6 +1,6 @@ # Cumulus Library UMLS -An installation of the Unified Medical Language System® Metathesaurus®. Part of the [SMART on FHIR Cumulus Project](https://smarthealthit.org/cumulus-a-universal-sidecar-for-a-smart-learning-healthcare-system/) +An installation of the Unified Medical Language System® Metathesaurus®. Part of the [SMART on FHIR Cumulus Project](https://smarthealthit.org/cumulus) For more information, [browse the documentation](https://docs.smarthealthit.org/cumulus/library). ## Usage @@ -17,7 +17,7 @@ This ends up being a fairly intensive operation - we download a large file, extract it, create parquet files from Athena, and then upload it. It usually takes a half hour to run. We try to preserve some of those artifacts along the way to make rebuilds faster. If you need to force recreation from scratch, the -`--replace-existing` CLI flag will handle this. +`--force-upload` CLI flag will handle this. 
## Licensing details diff --git a/cumulus_library_umls/umls/umls_builder.py b/cumulus_library_umls/umls/umls_builder.py index fb32108..f7ab343 100644 --- a/cumulus_library_umls/umls/umls_builder.py +++ b/cumulus_library_umls/umls/umls_builder.py @@ -12,7 +12,7 @@ def rmtree(self, root: pathlib.Path): :param root: the location at the base of the path you want to remove - TODO: replace with native pathlib.rmtree when upgrading to python 3.12 + TODO: replace with native pathlib.walk when upgrading to python 3.12 """ # just in case, if we get passed a file (like if there's an error and a diff --git a/pyproject.toml b/pyproject.toml index 543de87..fa3c856 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ test = [ ] [project.urls] -Home = "https://smarthealthit.org/cumulus-a-universal-sidecar-for-a-smart-learning-healthcare-system/" +Home = "https://smarthealthit.org/cumulus" Documentation = "https://docs.smarthealthit.org/cumulus/" Source = "https://github.com/smart-on-fhir/cumulus-library-umls" From bb6ddfa55dd5a717bded7ea989cf3e92be846035 Mon Sep 17 00:00:00 2001 From: Matt Garber Date: Wed, 8 May 2024 13:36:10 -0400 Subject: [PATCH 3/3] Added CONTRIBUTING, set lint to expected command --- .github/workflows/ci.yaml | 2 +- CONTRIBUTING.md | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) create mode 100644 CONTRIBUTING.md diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 5bc1d81..738c449 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -13,7 +13,7 @@ jobs: - name: Install linters run: | python -m pip install --upgrade pip - pip install ruff==0.2.1 + pip install ".[dev]" - name: Run ruff if: success() || failure() # still run black if above checks fails run: | diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..0dfce83 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,12 @@ +# Contributing to the UMLS study + +## Set up your dev environment + +To use the same 
dev environment as us, you'll want to run these commands: +```sh +pip install .[dev,test] +pre-commit install +``` + +This will install dependencies & build tools, +as well as set up an auto-formatter commit hook.