From 8d36baaecaad2fe0da4c49c9bbf852ff9f32fd53 Mon Sep 17 00:00:00 2001
From: Matt Garber <matthew.garber@childrens.harvard.edu>
Date: Wed, 8 May 2024 12:15:58 -0400
Subject: [PATCH 1/3] UMLS table creation

---
 .github/workflows/ci.yaml                 |  40 +++++
 .github/workflows/pypi.yaml               |  25 +++
 .gitignore                                | 137 +++++++++++++++
 .pre-commit-config.yaml                   |  17 ++
 README.MD                                 |  39 ++++-
 cumulus_library_umls/__init__.py          |   0
 cumulus_library_umls/umls/manifest.toml   |   6 +
 cumulus_library_umls/umls/umls_builder.py | 195 ++++++++++++++++++++++
 pyproject.toml                            |  65 ++++++++
 tests/__init__.py                         |   0
 tests/test_data/2000AA.zip                | Bin 0 -> 880 bytes
 tests/test_data/2000AA/META/TESTTABLE.RRF |   3 +
 tests/test_data/2000AA/META/TESTTABLE.ctl |  13 ++
 tests/test_data/README.MD                 |   5 +
 tests/test_umls_builder.py                | 100 +++++++++++
 15 files changed, 644 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/ci.yaml
 create mode 100644 .github/workflows/pypi.yaml
 create mode 100644 .gitignore
 create mode 100644 .pre-commit-config.yaml
 create mode 100644 cumulus_library_umls/__init__.py
 create mode 100644 cumulus_library_umls/umls/manifest.toml
 create mode 100644 cumulus_library_umls/umls/umls_builder.py
 create mode 100644 pyproject.toml
 create mode 100644 tests/__init__.py
 create mode 100644 tests/test_data/2000AA.zip
 create mode 100644 tests/test_data/2000AA/META/TESTTABLE.RRF
 create mode 100644 tests/test_data/2000AA/META/TESTTABLE.ctl
 create mode 100644 tests/test_data/README.MD
 create mode 100644 tests/test_umls_builder.py

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
new file mode 100644
index 0000000..7dc9305
--- /dev/null
+++ b/.github/workflows/ci.yaml
@@ -0,0 +1,40 @@
+name: CI
+on: [push]
+jobs:
+
+  lint:
+    runs-on: ubuntu-22.04
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python 
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+      - name: Install linters
+        run: |
+          python -m pip install --upgrade pip
+          pip install ruff==0.2.1
+      - name: Run ruff
+        if: success() || failure() # still run black if above checks fails
+        run: |
+          ruff check
+          ruff format --check
+
+  unittest:
+    name: unit tests
+    runs-on: ubuntu-22.04
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python 
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install ".[test]"
+      - name: Test with pytest
+        run: |
+          python -m pytest
diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml
new file mode 100644
index 0000000..39c5460
--- /dev/null
+++ b/.github/workflows/pypi.yaml
@@ -0,0 +1,25 @@
+name: PyPI
+
+on:
+  release:
+    types: [created]
+
+jobs:
+  publish:
+    runs-on: ubuntu-22.04
+    steps:
+    - uses: actions/checkout@v3
+
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install build
+
+    - name: Build
+      run: python -m build
+
+    - name: Publish
+      uses: pypa/gh-action-pypi-publish@release/v1
+      with:
+        password: ${{ secrets.PYPI_API_TOKEN }}
+        print_hash: true
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e8c465f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,137 @@
+downloads/
+generated_parquet/
+
+# project specific
+downloads/
+generated_parquet/
+output.sql
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..fe9656e
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,17 @@
+default_install_hook_types: [pre-commit, pre-push]
+repos:
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.2.1
+    hooks:
+      - name: Ruff formatting
+        id: ruff-format
+      - name: Ruff linting
+        id: ruff
+        stages: [pre-push]
+
+  - repo: https://github.com/sqlfluff/sqlfluff
+    rev: 2.3.4
+    hooks:
+      - id: sqlfluff-lint
+        types: []
+        types_or: [sql,jinja]
diff --git a/README.MD b/README.MD
index d5367dd..fe15503 100644
--- a/README.MD
+++ b/README.MD
@@ -1 +1,38 @@
-# Cumulus Library UMLS
\ No newline at end of file
+# Cumulus Library UMLS
+
+An installation of the Unified Medical Language System® Metathesaurus®. Part of the [SMART on FHIR Cumulus Project](https://smarthealthit.org/cumulus-a-universal-sidecar-for-a-smart-learning-healthcare-system/)
+
+For more information, [browse the documentation](https://docs.smarthealthit.org/cumulus/library).
+## Usage
+
+In order to use the Metathesaurus, you'll need to get an API key for access from the National Library of Medicine, which you can sign up for [here](https://uts.nlm.nih.gov/uts/signup-login).
+
+You can then install this module by running `pip install cumulus-library-umls`.
+
+This will add a `umls` target to `cumulus-library`. You'll need to pass your
+API key via the `--umls-key` CLI flag, or set the `UMLS_API_KEY` environment variable
+to the key you received from NIH.
+
+This ends up being a fairly intensive operation - we download a large file,
+extract it, create parquet files from Athena, and then upload it. It usually
+takes a half hour to run. We try to preserve some of those artifacts along
+the way to make rebuilds faster. If you need to force recreation from scratch, the
+`--replace-existing` CLI flag will handle this.
+
+## Licensing details
+
+The `cumulus-library-umls` study is provided as a convenience to install the
+UMLS Metathesaurus, but is not shipped with the Metathesaurus dataset. It will
+require an API key to download the data from NIH directly.
+
+As a reminder, the
+[License Agreement for Use of the UMLS® Metathesaurus®](https://uts.nlm.nih.gov/uts/assets/LicenseAgreement.pdf)
+places several restrictions on the usage of this data (including on distributing
+the dataset). When you sign up for a UMLS key, you are assuming responsibility
+for complying with these terms, or with an alternate licensing agreement with the
+owner of the Metathesaurus data if you are provided with one.
+
+
+## Citations
+
+Bodenreider O. The Unified Medical Language System (UMLS): integrating biomedical terminology. Nucleic Acids Res. 2004 Jan 1;32(Database issue):D267-70. doi: 10.1093/nar/gkh061. PubMed PMID: 14681409; PubMed Central PMCID: PMC308795.
\ No newline at end of file
diff --git a/cumulus_library_umls/__init__.py b/cumulus_library_umls/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/cumulus_library_umls/umls/manifest.toml b/cumulus_library_umls/umls/manifest.toml
new file mode 100644
index 0000000..002a324
--- /dev/null
+++ b/cumulus_library_umls/umls/manifest.toml
@@ -0,0 +1,6 @@
+study_prefix = "umls"
+
+[table_builder_config]
+file_names = [
+    "umls_builder.py"
+]
diff --git a/cumulus_library_umls/umls/umls_builder.py b/cumulus_library_umls/umls/umls_builder.py
new file mode 100644
index 0000000..fb32108
--- /dev/null
+++ b/cumulus_library_umls/umls/umls_builder.py
@@ -0,0 +1,195 @@
+import pathlib
+
+import pandas
+from cumulus_library import base_table_builder, base_utils, databases
+from cumulus_library.apis import umls
+from cumulus_library.template_sql import base_templates
+
+
+class UMLSBuilder(base_table_builder.BaseTableBuilder):
+    def rmtree(self, root: pathlib.Path) -> None:
+        """Recursively deletes a directory (or a lone file) and everything under it
+
+        :param root: the location at the base of the path you want to remove
+
+        TODO: replace with native pathlib.rmtree when upgrading to python 3.12
+        """
+
+        # just in case, if we get passed a file (like if there's an error and a
+        # zip file exists in the download dir)
+        if not root.is_dir():
+            root.unlink()
+        else:
+            for p in root.iterdir():
+                if p.is_dir():
+                    self.rmtree(p)  # recurse into subdirectories first
+                else:
+                    p.unlink()
+            root.rmdir()  # children were removed above; rmdir needs an empty dir
+
+    def get_umls_data(
+        self,
+        download_path: pathlib.Path,
+        parquet_path: pathlib.Path,
+        force_upload: bool,
+        umls_key: str,
+    ) -> tuple[list, bool, str]:
+        """Fetches and extracts data from the UMLS API
+
+        :param download_path: the dir UMLS releases are downloaded to and read from
+        :param parquet_path: the location output is written; only used for deletion
+            if a new dataset is downloaded
+        :param force_upload: if True, will download from UMLS regardless of data on disk
+        :param umls_key: the UMLS API key to use to auth requests
+        :returns:
+            - filtered_files - .ctl files to process (MRX* language tables excluded)
+            - download_required - if True, a new UMLS release needed to be retrieved
+            - release_version - the name of the folder data was extracted to
+        """
+        api = umls.UmlsApi(api_key=umls_key)
+        metadata = api.get_latest_umls_file_release(
+            target="umls-metathesaurus-full-subset"
+        )
+        download_required = False
+        if not (download_path / metadata["releaseVersion"]).exists():
+            print("New UMLS release available, downloading & updating...")
+            download_required = True
+            for version in download_path.iterdir():
+                self.rmtree(version)  # remove whatever was downloaded before
+            for version in (parquet_path).iterdir():
+                self.rmtree(version)  # ...and everything under parquet_path
+        if download_required or force_upload:
+            api.download_umls_files(
+                target="umls-metathesaurus-full-subset", path=download_path
+            )
+        files = list(download_path.glob(f'./{metadata["releaseVersion"]}/META/*.ctl'))
+        filtered_files = []
+        for file in files:
+            if not file.stem.startswith("MRX"):
+                filtered_files.append(file)
+        return filtered_files, download_required, metadata["releaseVersion"]
+
+    def sql_type_to_df_parquet_type(self, text: str) -> tuple[str, str]:
+        """Converts a column type from a .ctl definition to dataframe/parquet types
+
+        :param text: the type to convert, e.g. "char(10)," or "integer external"
+        :returns: a (pandas dtype, parquet type) pair, e.g. ("string", "String")
+        """
+        text = text.split("(")[0].strip(",").replace(" external", "")
+        match text:
+            case "char":
+                return "string", "String"
+            case "integer":
+                return "Int64", "Integer"
+            case "float":
+                return "float", "Float"
+            case _:
+                raise Exception(f"'{text}' missing a type converter")
+
+    def parse_ctl_file(self, contents: list[str]) -> tuple[str | None, dict]:
+        """Extracts table and type definitions from a *.ctl file
+
+        :param contents: a list of strings, as returned by a file.readlines() call
+        :returns:
+            - datasource - the file named on the infile line (None if there is none)
+            - table - a dict with "headers", "dtype" and "parquet_types" entries
+        """
+        datasource = None
+        table = {"headers": [], "dtype": {}, "parquet_types": []}
+        is_col_def_section = False
+        for line in contents:
+            if line is None:
+                continue
+            if line.startswith("infile"):
+                datasource = line.split(" ")[1].rstrip().replace("'", "")
+            elif line.startswith("("):
+                is_col_def_section = True
+                line = line[1:]  # the first column shares its line with "("
+            elif line.startswith(")"):
+                is_col_def_section = False  # ")" closes the column list
+            if is_col_def_section:
+                if line is not None:
+                    line = line.strip().split("\t")  # [column name, column type]
+                    df_type, parquet_type = self.sql_type_to_df_parquet_type(line[1])
+                    table["headers"].append(line[0])
+                    table["dtype"][line[0]] = df_type
+                    table["parquet_types"].append(parquet_type)
+        return datasource, table
+
+    def create_parquet(
+        self,
+        rrf_path: pathlib.Path,
+        parquet_path: pathlib.Path,
+        table: dict,
+        force_upload: bool = False,
+    ) -> None:
+        """Creates a parquet file from a .rrf metathesaurus file
+
+        :param rrf_path: the location of the .rrf file to convert
+        :param parquet_path: the directory to write output parquet to
+        :param table: a table definition created by parse_ctl_file
+        :param force_upload: if False, nothing is done when the output parquet
+            file already exists; if True, it is always regenerated
+        """
+        if not force_upload:
+            if (parquet_path / f"{rrf_path.stem}.parquet").exists():
+                return
+        df = pandas.read_csv(
+            rrf_path,
+            delimiter="|",
+            names=table["headers"],
+            dtype=table["dtype"],
+            index_col=False,  # rows end with a trailing "|"; keep column 0 as data
+        )
+        df.to_parquet(parquet_path / f"{rrf_path.stem}.parquet")
+
+    def prepare_queries(
+        self,
+        cursor: databases.DatabaseCursor,
+        schema: str,
+        *args,
+        config=base_utils.StudyConfig,  # NOTE(review): `:` intended, not `=`?
+        **kwargs,
+    ):
+        download_path = pathlib.Path(__file__).resolve().parent / "downloads"
+        download_path.mkdir(exist_ok=True, parents=True)
+        parquet_path = pathlib.Path(__file__).resolve().parent / "generated_parquet"
+        parquet_path.mkdir(exist_ok=True, parents=True)
+        files, new_version, folder = self.get_umls_data(
+            download_path, parquet_path, config.force_upload, config.umls_key
+        )
+        parquet_path = parquet_path / folder  # one subdirectory per UMLS release
+        parquet_path.mkdir(exist_ok=True, parents=True)
+        # For each table: build its parquet file, upload it, then queue a CTAS query
+        with base_utils.get_progress_bar() as progress:
+            task = progress.add_task(
+                None,
+                total=len(files),
+            )
+            for file in files:
+                with open(file) as f:  # file is a .ctl table definition
+                    datasource, table = self.parse_ctl_file(f.readlines())
+                    progress.update(task, description=f"Compressing {datasource}...")
+                    rrf_path = download_path / f"./{folder}/META/{datasource}"
+                    self.create_parquet(
+                        rrf_path, parquet_path, table, force_upload=config.force_upload
+                    )
+                    progress.update(task, description=f"Uploading {datasource}...")
+                    remote_path = config.db.upload_file(
+                        file=parquet_path / f"{file.stem}.parquet",
+                        study="umls",
+                        topic=file.stem,
+                        remote_filename=f"{file.stem}.parquet",
+                        force_upload=config.force_upload or new_version,
+                    )
+                    self.queries.append(
+                        base_templates.get_ctas_from_parquet_query(
+                            schema_name=schema,
+                            table_name=f"umls__{file.stem}",
+                            local_location=parquet_path / f"{file.stem}.parquet",
+                            remote_location=remote_path,
+                            table_cols=table["headers"],
+                            remote_table_cols_types=table["parquet_types"],
+                        )
+                    )
+                    progress.advance(task)
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..543de87
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,65 @@
+[project]
+name = "cumulus-library-umls"
+requires-python = ">= 3.10"
+dependencies = [
+    "cumulus-library >= 2.1",
+]
+description = "A Unified Medical Language System® Metathesaurus study for the Cumulus project"
+readme = "README.MD"  # must match the committed file's name; case matters on Linux
+license = { text="Apache License 2.0" }
+keywords = ["FHIR", "SQL", "Health Informatics"]
+classifiers = [
+    "License :: OSI Approved :: Apache Software License",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python :: 3",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+]
+dynamic=["version"]  # NOTE(review): flit reads __version__ from the package; __init__.py is empty
+[project.optional-dependencies]
+dev = [
+    "ruff == 0.2.1",
+    "pre-commit",
+]
+test = [
+    "pytest",
+    "responses"
+]
+
+[project.urls]
+Home = "https://smarthealthit.org/cumulus-a-universal-sidecar-for-a-smart-learning-healthcare-system/"
+Documentation = "https://docs.smarthealthit.org/cumulus/"
+Source = "https://github.com/smart-on-fhir/cumulus-library-umls"
+
+[build-system]
+requires = ["flit_core >=3.4,<4"]
+build-backend = "flit_core.buildapi"
+
+[tool.flit.sdist]
+include = [".sqlfluff"]
+
+[tool.pytest.ini_options]
+minversion = "6.0"
+testpaths = [
+    "tests",
+]
+
+[tool.ruff]
+target-version = "py310"
+
+[tool.ruff.lint]
+select = [
+    "A",  # prevent using keywords that clobber python builtins
+    "B",  # bugbear: security warnings
+    "E",  # pycodestyle
+    "F",  # pyflakes
+    "I",  # isort
+    "ISC",  # implicit string concatenation
+    "PLE",  # pylint errors
+    "RUF",  # the ruff developer's own rules
+    "UP",  # alert you when better syntax is available in your python version
+]
+ignore = [
+# Recommended ignore from `ruff format` due to in-project conflicts with check.
+# It's expected that this will be fixed in the coming months.
+    "ISC001"
+]
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_data/2000AA.zip b/tests/test_data/2000AA.zip
new file mode 100644
index 0000000000000000000000000000000000000000..898844cb6a99aa5c7c2d34cd49ae3f56db9304ab
GIT binary patch
literal 880
zcmWIWW@h1H0D;Ga%Ok)HD8bGk!(e1!VBqMe9~#2Rz`QcwD(w>xmsW5yFtU7QWME(s
z0V)lE8<1LrY5)(y0AJS-s44RbtkOUjizy(}I2fM9t%#_f{L*F%koOaaCDBX?aSaX$
zadh%=)k`kP0Xr;ufpuCe5ThBH+Gm)3*g&B5`Mvc!e9fjz$ceMs(re4g)Mr&<<30U{
z!?dOnIj+ASCu){t9IyVq_xSX8TAE@xds51NEvkGf=A_g=Z_0tQQ%-6=Khz~(_QIfG
z&&!@x@h+`(584`ztY&db{@uNGeg5(~^S<WB&Oi6}xUuaKasPS$ym<NoEf@DwrNP2l
zzEgj?KG2e>IheJ@W6hM~FS*Zt4^!q8%9pq<5W~ePm2E%YZ1=Mb`STzCt$9;_Ze8m)
zMo?fvf&mnmrx|OP$N+;t4R0_61-XHP0TikWFhiD`v(scv;W>ZLtEOunasfF_1w993
zk~xm=U<1Y-Ba<96uIQHlS_=XK3~wDlG(0h|LJ|X7cq1EznG}!>1DOUj44N>2CSpw#
zAm`yq77!DG$&g`5V>_COXo&=9JSdT1bug~rMmD|=XbZA~5ypc9AD8P{*?`VsU;)Bd
LpaFkDPGtZ9)pg!@

literal 0
HcmV?d00001

diff --git a/tests/test_data/2000AA/META/TESTTABLE.RRF b/tests/test_data/2000AA/META/TESTTABLE.RRF
new file mode 100644
index 0000000..fbbe655
--- /dev/null
+++ b/tests/test_data/2000AA/META/TESTTABLE.RRF
@@ -0,0 +1,3 @@
+TTY1|Code-1|
+TTY2|Code-2|
+TTY3|Code-3|
\ No newline at end of file
diff --git a/tests/test_data/2000AA/META/TESTTABLE.ctl b/tests/test_data/2000AA/META/TESTTABLE.ctl
new file mode 100644
index 0000000..fed3d33
--- /dev/null
+++ b/tests/test_data/2000AA/META/TESTTABLE.ctl
@@ -0,0 +1,13 @@
+options (direct=true)
+load data
+characterset UTF8 length semantics char
+infile 'TESTTABLE.RRF'
+badfile 'TESTTABLE.bad'
+discardfile 'TESTTABLE.dsc'
+truncate
+into table TESTTABLE
+fields terminated by '|'
+trailing nullcols
+(TTY	char(10),
+CODE	char(8)
+)
\ No newline at end of file
diff --git a/tests/test_data/README.MD b/tests/test_data/README.MD
new file mode 100644
index 0000000..bfc6a74
--- /dev/null
+++ b/tests/test_data/README.MD
@@ -0,0 +1,5 @@
+# Test data details
+
+2000AA.zip is generated from zipping the associated 2000AA folder. This matches
+the download format, as well as the .ctl/.rrf formats produced by the UMLS
+Metathesaurus exports.
\ No newline at end of file
diff --git a/tests/test_umls_builder.py b/tests/test_umls_builder.py
new file mode 100644
index 0000000..2ada8db
--- /dev/null
+++ b/tests/test_umls_builder.py
@@ -0,0 +1,100 @@
+import os
+from unittest import mock
+
+import pytest
+import responses
+from cumulus_library import base_utils, databases, db_config
+
+from cumulus_library_umls.umls import umls_builder
+
+AUTH_URL = "https://utslogin.nlm.nih.gov/validateUser"
+RELEASE_URL = "https://uts-ws.nlm.nih.gov/releases"
+DOWNLOAD_URL = "https://uts-ws.nlm.nih.gov/download"
+
+
+@pytest.fixture
+def mock_responses():
+    with responses.RequestsMock(assert_all_requests_are_fired=False) as response:
+        with open("./tests/test_data/2000AA.zip", "rb") as download_zip:
+            response.add(
+                responses.GET,
+                AUTH_URL,  # presumably the API key validation call
+                body="true",
+                status=200,
+                content_type="application/json",
+            )
+            response.add(
+                responses.GET,
+                RELEASE_URL,  # release metadata; advertises version 2000AA
+                body="""[{
+                    "fileName": "2000AA.zip",
+                    "releaseVersion": "2000AA",
+                    "releaseDate": "2000-01-01",
+                    "downloadUrl": "https://download.nlm.nih.gov/umls/kss/2000AA/2000AA.zip",
+                    "releaseType": "UMLS Metathesaurus Level 0 Subset",
+                    "product": "UMLS",
+                    "current": true
+                    }]""",
+                status=200,
+                content_type="application/json",
+            )
+            response.add(
+                responses.GET,
+                DOWNLOAD_URL,
+                body=download_zip.read(),  # bytes of the checked-in test archive
+                status=200,
+                content_type="application/zip",
+            )
+            yield response  # mocks stay registered while the test runs
+
+
+@mock.patch.dict(
+    os.environ,
+    clear=True,
+)
+@mock.patch("pathlib.Path.resolve")
+def test_create_query(mock_resolve, mock_responses, tmp_path):
+    mock_loc = tmp_path / "umls_builder.py"
+    mock_resolve.return_value = mock_loc  # builder dirs now live under tmp_path
+
+    db_config.db_type = "duckdb"
+    config = base_utils.StudyConfig(
+        db=databases.DuckDatabaseBackend(f"{tmp_path}/duckdb"), umls_key="123"
+    )
+    builder = umls_builder.UMLSBuilder()
+    builder.prepare_queries(cursor=config.db.cursor(), schema="main", config=config)
+    expected = f"""CREATE TABLE IF NOT EXISTS umls__TESTTABLE AS SELECT
+    TTY,
+    CODE
+FROM read_parquet('{
+        tmp_path / "generated_parquet/2000AA"
+    }/TESTTABLE.parquet/*.parquet')"""
+    assert expected == builder.queries[0]  # first query is for TESTTABLE
+
+
+@mock.patch.dict(
+    os.environ,
+    clear=True,
+)
+@mock.patch("pathlib.Path.resolve")
+def test_create_query_download_exists(mock_resolve, mock_responses, tmp_path):
+    mock_loc = tmp_path / "umls_builder.py"
+    mock_resolve.return_value = mock_loc  # builder dirs now live under tmp_path
+
+    prev_download_path = tmp_path / "downloads/1999AA/"
+    prev_download_path.mkdir(exist_ok=True, parents=True)  # fake an older release
+    prev_parquet_path = tmp_path / "generated_parquet/1999AA/"
+    prev_parquet_path.mkdir(exist_ok=True, parents=True)  # ...and its output dir
+
+    db_config.db_type = "duckdb"
+    config = base_utils.StudyConfig(
+        db=databases.DuckDatabaseBackend(f"{tmp_path}/duckdb"), umls_key="123"
+    )
+    builder = umls_builder.UMLSBuilder()
+    builder.prepare_queries(cursor=config.db.cursor(), schema="main", config=config)
+    download_dirs = sorted((tmp_path / "downloads").iterdir())
+    assert len(download_dirs) == 1  # the 1999AA dir was removed
+    assert "2000AA" in str(download_dirs[0])
+    parquet_dirs = sorted((tmp_path / "generated_parquet").iterdir())
+    assert len(parquet_dirs) == 1  # same for the old parquet dir
+    assert "2000AA" in str(parquet_dirs[0])

From 1729cd178a5bf7aaad6a5193dc7370abe1e6414b Mon Sep 17 00:00:00 2001
From: Matt Garber <matthew.garber@childrens.harvard.edu>
Date: Wed, 8 May 2024 13:23:02 -0400
Subject: [PATCH 2/3] Docs updates, some configs

---
 .github/workflows/ci.yaml                 | 4 ++--
 .github/workflows/pypi.yaml               | 2 +-
 README.MD                                 | 4 ++--
 cumulus_library_umls/umls/umls_builder.py | 2 +-
 pyproject.toml                            | 2 +-
 5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 7dc9305..5bc1d81 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -3,7 +3,7 @@ on: [push]
 jobs:
 
   lint:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
       - name: Set up Python 
@@ -22,7 +22,7 @@ jobs:
 
   unittest:
     name: unit tests
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
 
diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml
index 39c5460..18c6c53 100644
--- a/.github/workflows/pypi.yaml
+++ b/.github/workflows/pypi.yaml
@@ -8,7 +8,7 @@ jobs:
   publish:
     runs-on: ubuntu-22.04
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
 
     - name: Install dependencies
       run: |
diff --git a/README.MD b/README.MD
index fe15503..eab5159 100644
--- a/README.MD
+++ b/README.MD
@@ -1,6 +1,6 @@
 # Cumulus Library UMLS
 
-An installation of the Unified Medical Language System® Metathesaurus®. Part of the [SMART on FHIR Cumulus Project](https://smarthealthit.org/cumulus-a-universal-sidecar-for-a-smart-learning-healthcare-system/)
+An installation of the Unified Medical Language System® Metathesaurus®. Part of the [SMART on FHIR Cumulus Project](https://smarthealthit.org/cumulus)
 
 For more information, [browse the documentation](https://docs.smarthealthit.org/cumulus/library).
 ## Usage
@@ -17,7 +17,7 @@ This ends up being a fairly intensive operation - we download a large file,
 extract it, create parquet files from Athena, and then upload it. It usually
 takes a half hour to run. We try to preserve some of those artifacts along
 the way to make rebuilds faster. If you need to force recreation from scratch, the
-`--replace-existing` CLI flag will handle this.
+`--force-upload` CLI flag will handle this.
 
 ## Licensing details
 
diff --git a/cumulus_library_umls/umls/umls_builder.py b/cumulus_library_umls/umls/umls_builder.py
index fb32108..f7ab343 100644
--- a/cumulus_library_umls/umls/umls_builder.py
+++ b/cumulus_library_umls/umls/umls_builder.py
@@ -12,7 +12,7 @@ def rmtree(self, root: pathlib.Path):
 
         :param root: the location at the base of the path you want to remove
 
-        TODO: replace with native pathlib.rmtree when upgrading to python 3.12
+        TODO: replace with native pathlib.walk when upgrading to python 3.12
         """
 
         # just in case, if we get passed a file (like if there's an error and a
diff --git a/pyproject.toml b/pyproject.toml
index 543de87..fa3c856 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,7 +26,7 @@ test = [
 ]
 
 [project.urls]
-Home = "https://smarthealthit.org/cumulus-a-universal-sidecar-for-a-smart-learning-healthcare-system/"
+Home = "https://smarthealthit.org/cumulus"
 Documentation = "https://docs.smarthealthit.org/cumulus/"
 Source = "https://github.com/smart-on-fhir/cumulus-library-umls"
 

From bb6ddfa55dd5a717bded7ea989cf3e92be846035 Mon Sep 17 00:00:00 2001
From: Matt Garber <matthew.garber@childrens.harvard.edu>
Date: Wed, 8 May 2024 13:36:10 -0400
Subject: [PATCH 3/3] Added CONTRIBUTING, set lint to expected command

---
 .github/workflows/ci.yaml |  2 +-
 CONTRIBUTING.md           | 12 ++++++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)
 create mode 100644 CONTRIBUTING.md

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 5bc1d81..738c449 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -13,7 +13,7 @@ jobs:
       - name: Install linters
         run: |
           python -m pip install --upgrade pip
-          pip install ruff==0.2.1
+          pip install ".[dev]"
       - name: Run ruff
         if: success() || failure() # still run black if above checks fails
         run: |
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..0dfce83
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,12 @@
+# Contributing to the UMLS study
+
+## Set up your dev environment
+
+To use the same dev environment as us, you'll want to run these commands:
+```sh
+pip install ".[dev,test]"
+pre-commit install
+```
+
+This will install dependencies & build tools,
+as well as set up an auto-formatter commit hook.