From 3503ebc859a126a35802cb5156e73721e651c2f8 Mon Sep 17 00:00:00 2001
From: Matt Garber
Date: Wed, 8 May 2024 12:15:58 -0400
Subject: [PATCH] UMLS table creation

---
 .github/workflows/ci.yaml                 |  21 +++
 .github/workflows/pypi.yaml               |  25 ++++
 .gitignore                                | 137 ++++++++++++++++++
 .pre-commit-config.yaml                   |  17 +++
 README.MD                                 |  39 ++++-
 cumulus_library_umls/__init__.py          |   0
 cumulus_library_umls/umls/manifest.toml   |   6 +
 cumulus_library_umls/umls/umls_builder.py | 164 ++++++++++++++++++++++
 pyproject.toml                            |  65 +++++++++
 tests/__init__.py                         |   0
 tests/test_data/2000AA.zip                | Bin 0 -> 880 bytes
 tests/test_data/2000AA/META/TESTTABLE.RRF |   3 +
 tests/test_data/2000AA/META/TESTTABLE.ctl |  13 ++
 tests/test_data/README.MD                 |   5 +
 tests/test_umls_builder.py                | 111 +++++++++++++++
 15 files changed, 605 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/ci.yaml
 create mode 100644 .github/workflows/pypi.yaml
 create mode 100644 .gitignore
 create mode 100644 .pre-commit-config.yaml
 create mode 100644 cumulus_library_umls/__init__.py
 create mode 100644 cumulus_library_umls/umls/manifest.toml
 create mode 100644 cumulus_library_umls/umls/umls_builder.py
 create mode 100644 pyproject.toml
 create mode 100644 tests/__init__.py
 create mode 100644 tests/test_data/2000AA.zip
 create mode 100644 tests/test_data/2000AA/META/TESTTABLE.RRF
 create mode 100644 tests/test_data/2000AA/META/TESTTABLE.ctl
 create mode 100644 tests/test_data/README.MD
 create mode 100644 tests/test_umls_builder.py

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
new file mode 100644
index 0000000..95c87df
--- /dev/null
+++ b/.github/workflows/ci.yaml
+name: CI
+on: [push]
+jobs:
+
+  lint:
+    runs-on: ubuntu-22.04
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+      - name: Install linters
+        run: |
+          python -m pip install --upgrade pip
+          pip install ruff==0.2.1
+      - name: Run ruff
+        if: success() || failure()  # still run the format check if earlier steps fail
+        run: |
+          ruff check
+          ruff format --check
\ No newline at end of file
diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml
new file mode 100644
index 0000000..b491448
--- /dev/null
+++ b/.github/workflows/pypi.yaml
+name: PyPI
+
+on:
+  release:
+    types: [created]
+
+jobs:
+  publish:
+    runs-on: ubuntu-22.04
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install build
+
+      - name: Build
+        run: python -m build
+
+      - name: Publish
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          password: ${{ secrets.PYPI_API_TOKEN }}
+          print_hash: true
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e8c465f
--- /dev/null
+++ b/.gitignore
+downloads/
+generated_parquet/
+
+# project specific
+downloads/
+generated_parquet/
+output.sql
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..ad3f57a
--- /dev/null
+++ b/.pre-commit-config.yaml
+default_install_hook_types: [pre-commit, pre-push]
+repos:
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.2.1
+    hooks:
+      - name: Ruff formatting
+        id: ruff-format
+      - name: Ruff linting
+        id: ruff
+        stages: [pre-push]
+
+  - repo: https://github.com/sqlfluff/sqlfluff
+    rev: 2.3.4
+    hooks:
+      - id: sqlfluff-lint
+        types: []
+        types_or: [sql, jinja]
\ No newline at end of file
diff --git a/README.MD b/README.MD
index d5367dd..fe15503 100644
--- a/README.MD
+++ b/README.MD
-# Cumulus Library UMLS
\ No newline at end of file
+# Cumulus Library UMLS
+
+An installation of the Unified Medical Language System® Metathesaurus®, as part
+of the [SMART on FHIR Cumulus Project](https://smarthealthit.org/cumulus-a-universal-sidecar-for-a-smart-learning-healthcare-system/).
+
+For more information, [browse the documentation](https://docs.smarthealthit.org/cumulus/library).
+
+## Usage
+
+In order to use the Metathesaurus, you'll need an API key from the National
+Library of Medicine, which you can sign up for
+[here](https://uts.nlm.nih.gov/uts/signup-login).
+
+You can then install this module by running `pip install cumulus-library-umls`.
+
+This will add a `umls` target to `cumulus-library`. You'll need to pass your
+API key via the `--umls-key` CLI flag, or set the `UMLS_API_KEY` environment
+variable to the key you received from NIH.
+
+This is a fairly intensive operation: we download a large file, extract it,
+convert the contents to parquet files, and then upload them to Athena. It
+usually takes about half an hour to run. We try to preserve some of those
+artifacts along the way to make rebuilds faster. If you need to force
+recreation from scratch, the `--replace-existing` CLI flag will handle this.
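+
+For example, a typical invocation might look like the following (illustrative
+only; the exact flag spelling depends on your `cumulus-library` version, so
+check `cumulus-library --help`):
+
+```sh
+export UMLS_API_KEY=your-api-key
+cumulus-library build --target umls
+
+# Or force a full rebuild, passing the key explicitly:
+cumulus-library build --target umls --umls-key your-api-key --replace-existing
+```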
+
+## Licensing details
+
+The `cumulus-library-umls` study is provided as a convenience to install the
+UMLS Metathesaurus, but is not shipped with the Metathesaurus dataset. It will
+require an API key to download the data from NIH directly.
+
+As a reminder, the
+[License Agreement for Use of the UMLS® Metathesaurus®](https://uts.nlm.nih.gov/uts/assets/LicenseAgreement.pdf)
+places several restrictions on the usage of this data (including distributing
+the dataset). When you sign up for a UMLS key, you are assuming responsibility
+for complying with these terms, or with an alternate licensing agreement with
+the owner of the Metathesaurus data if you have been provided one.
+
+
+## Citations
+
+Bodenreider O. The Unified Medical Language System (UMLS): integrating biomedical terminology. Nucleic Acids Res. 2004 Jan 1;32(Database issue):D267-70. doi: 10.1093/nar/gkh061. PubMed PMID: 14681409; PubMed Central PMCID: PMC308795.
\ No newline at end of file
diff --git a/cumulus_library_umls/__init__.py b/cumulus_library_umls/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/cumulus_library_umls/umls/manifest.toml b/cumulus_library_umls/umls/manifest.toml
new file mode 100644
index 0000000..002a324
--- /dev/null
+++ b/cumulus_library_umls/umls/manifest.toml
+study_prefix = "umls"
+
+[table_builder_config]
+file_names = [
+    "umls_builder.py"
+]
diff --git a/cumulus_library_umls/umls/umls_builder.py b/cumulus_library_umls/umls/umls_builder.py
new file mode 100644
index 0000000..0c559fc
--- /dev/null
+++ b/cumulus_library_umls/umls/umls_builder.py
+import pathlib
+
+import pandas
+
+from cumulus_library import base_table_builder, base_utils, databases
+from cumulus_library.apis import umls
+from cumulus_library.template_sql import base_templates
+
+
+class UMLSBuilder(base_table_builder.BaseTableBuilder):
+    def rmtree(self, root: pathlib.Path):
+        # Just in case: we may get passed a file rather than a directory (like
+        # if there's an error and a zip file exists in the download dir)
+        if not root.is_dir():
+            root.unlink()
+        else:
+            for p in root.iterdir():
+                if p.is_dir():
+                    self.rmtree(p)
+                else:
+                    p.unlink()
+            root.rmdir()
+
+    def get_umls_data(
+        self,
+        download_path: pathlib.Path,
+        parquet_path: pathlib.Path,
+        force_upload: bool,
+        umls_key: str,
+    ):
+        api = umls.UmlsApi(api_key=umls_key)
+        metadata = api.get_latest_umls_file_release(
+            target="umls-metathesaurus-full-subset"
+        )
+        download_required = False
+        if not (download_path / metadata["releaseVersion"]).exists():
+            print("New UMLS release available, downloading & updating...")
+            download_required = True
+            # Only keep one release on disk at a time
+            for version in download_path.iterdir():
+                self.rmtree(version)
+            for version in parquet_path.iterdir():
+                self.rmtree(version)
+        if download_required or force_upload:
+            api.download_umls_files(
+                target="umls-metathesaurus-full-subset", path=download_path
+            )
+        files = list(download_path.glob(f'./{metadata["releaseVersion"]}/META/*.ctl'))
+        filtered_files = []
+        for file in files:
+            # Skip the MRX* word-index tables
+            if not file.stem.startswith("MRX"):
+                filtered_files.append(file)
+        return filtered_files, download_required, metadata["releaseVersion"]
+
+    def sql_type_to_df_parquet_type(self, text):
+        # Normalize a SQL*Loader type like 'char(10)' or 'integer external'
+        # down to its base type name
+        text = text.split("(")[0].strip(",").replace(" external", "")
+        match text:
+            case "char":
+                return "string", "String"
+            case "integer":
+                return "Int64", "Integer"
+            case "float":
+                return "float", "Float"
+            case _:
+                raise ValueError(f"'{text}' missing a type converter")
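+
+    # For orientation, a sketch of what parse_ctl_file produces (illustrative
+    # only, based on the tests/test_data fixture; columns in the .ctl files
+    # are tab-separated). Given a .ctl fragment like:
+    #     infile 'TESTTABLE.RRF'
+    #     (TTY	char(10),
+    #     CODE	char(8)
+    #     )
+    # it should return:
+    #     datasource == "TESTTABLE.RRF"
+    #     table == {
+    #         "headers": ["TTY", "CODE"],
+    #         "dtype": {"TTY": "string", "CODE": "string"},
+    #         "parquet_types": ["String", "String"],
+    #     }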
+    def parse_ctl_file(self, contents: list[str]):
+        datasource = None
+        table = {"headers": [], "dtype": {}, "parquet_types": []}
+        is_col_def_section = False
+        for line in contents:
+            if line is None:
+                continue
+            if line.startswith("infile"):
+                datasource = line.split(" ")[1].rstrip().replace("'", "")
+            elif line.startswith("("):
+                is_col_def_section = True
+                line = line[1:]
+            elif line.startswith(")"):
+                is_col_def_section = False
+            if is_col_def_section:
+                if line is not None:
+                    line = line.strip().split("\t")
+                    df_type, parquet_type = self.sql_type_to_df_parquet_type(line[1])
+                    table["headers"].append(line[0])
+                    table["dtype"][line[0]] = df_type
+                    table["parquet_types"].append(parquet_type)
+        return datasource, table
+
+    def create_parquet(
+        self,
+        rrf_path: pathlib.Path,
+        parquet_path: pathlib.Path,
+        table: dict,
+        force_upload=False,
+    ):
+        # Reuse a previously generated parquet file unless we're forcing a rebuild
+        if not force_upload:
+            if (parquet_path / f"{rrf_path.stem}.parquet").exists():
+                return
+        df = pandas.read_csv(
+            rrf_path,
+            delimiter="|",
+            names=table["headers"],
+            dtype=table["dtype"],
+            # RRF rows end with a trailing delimiter; index_col=False keeps
+            # pandas from treating the first column as an index
+            index_col=False,
+        )
+        df.to_parquet(parquet_path / f"{rrf_path.stem}.parquet")
+
+    def prepare_queries(
+        self,
+        cursor: databases.DatabaseCursor,
+        schema: str,
+        *args,
+        config: base_utils.StudyConfig,
+        **kwargs,
+    ):
+        download_path = pathlib.Path(__file__).resolve().parent / "downloads"
+        download_path.mkdir(exist_ok=True, parents=True)
+        parquet_path = pathlib.Path(__file__).resolve().parent / "generated_parquet"
+        parquet_path.mkdir(exist_ok=True, parents=True)
+        files, new_version, folder = self.get_umls_data(
+            download_path, parquet_path, config.force_upload, config.umls_key
+        )
+        parquet_path = parquet_path / folder
+        parquet_path.mkdir(exist_ok=True, parents=True)
+
+        with base_utils.get_progress_bar() as progress:
+            task = progress.add_task(
+                None,
+                total=len(files),
+            )
+            for file in files:
+                with open(file) as f:
+                    datasource, table = self.parse_ctl_file(f.readlines())
+                progress.update(task, description=f"Compressing {datasource}...")
+                rrf_path = download_path / f"./{folder}/META/{datasource}"
+                self.create_parquet(
+                    rrf_path, parquet_path, table, force_upload=config.force_upload
+                )
+                progress.update(task, description=f"Uploading {datasource}...")
+                remote_path = config.db.upload_file(
+                    file=parquet_path / f"{file.stem}.parquet",
+                    study="umls",
+                    topic=file.stem,
+                    remote_filename=f"{file.stem}.parquet",
+                    force_upload=config.force_upload or new_version,
+                )
+                self.queries.append(
+                    base_templates.get_ctas_from_parquet_query(
+                        schema_name=schema,
+                        table_name=f"umls__{file.stem}",
+                        # TODO: this local location needs to be modified to sometimes
+                        # expect a file rather than a dir, or the generated output
+                        # needs to reflect this better
+                        local_location=parquet_path / f"{file.stem}.parquet",
+                        remote_location=remote_path,
+                        table_cols=table["headers"],
+                        remote_table_cols_types=table["parquet_types"],
+                    )
+                )
+                progress.advance(task)
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..53d39ad
--- /dev/null
+++ b/pyproject.toml
+[project]
+name = "cumulus-library-umls"
+requires-python = ">= 3.10"
+dependencies = [
+    "cumulus-library >= 2.1",
+]
+description = "A Unified Medical Language System® Metathesaurus study for the Cumulus project"
+readme = "README.MD"
+license = { text = "Apache License 2.0" }
+keywords = ["FHIR", "SQL", "Health Informatics"]
+classifiers = [
+    "License :: OSI Approved :: Apache Software License",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python :: 3",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+]
+dynamic = ["version"]
+
+[project.optional-dependencies]
+dev = [
+    "ruff == 0.2.1",
+    "pre-commit",
+]
+test = [
+    "pytest",
+    "responses"
+]
+
+[project.urls]
+Home = "https://smarthealthit.org/cumulus-a-universal-sidecar-for-a-smart-learning-healthcare-system/"
+Documentation = "https://docs.smarthealthit.org/cumulus/"
+Source = "https://github.com/smart-on-fhir/cumulus-library-umls"
+
+[build-system]
+requires = ["flit_core >=3.4,<4"]
+build-backend = "flit_core.buildapi"
+
+[tool.flit.sdist]
+include = [".sqlfluff"]
+
+[tool.pytest.ini_options]
+minversion = "6.0"
+testpaths = [
+    "tests",
+]
+
+[tool.ruff]
+target-version = "py310"
+
+[tool.ruff.lint]
+select = [
+    "A",  # prevent using keywords that clobber python builtins
+    "B",  # bugbear: security warnings
+    "E",  # pycodestyle
+    "F",  # pyflakes
+    "I",  # isort
+    "ISC",  # implicit string concatenation
+    "PLE",  # pylint errors
+    "RUF",  # the ruff developer's own rules
+    "UP",  # alert you when better syntax is available in your python version
+]
+ignore = [
+    # Recommended ignore from `ruff format` due to in-project conflicts with check.
+    # It's expected that this will be fixed in the coming months.
+    "ISC001"
+]
\ No newline at end of file
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_data/2000AA.zip b/tests/test_data/2000AA.zip
new file mode 100644
index 0000000000000000000000000000000000000000..898844cb6a99aa5c7c2d34cd49ae3f56db9304ab
GIT binary patch
literal 880
zcmWIWW@h1H0D;Ga%Ok)HD8bGk!(e1!VBqMe9~#2Rz`QcwD(w>xmsW5yFtU7QWME(s
z0V)lE8<1LrY5)(y0AJS-s44RbtkOUjizy(}I2fM9t%#_f{L*F%koOaaCDBX?aSaX$
zadh%=)k`kP0Xr;ufpuCe5ThBH+Gm)3*g&B5`Mvc!e9fjz$ceMs(re4g)Mr&<<30U{
z!?dOnIj+ASCu){t9IyVq_xSX8TAE@xds51NEvkGf=A_g=Z_0tQQ%-6=Khz~(_QIfG
z&&!@x@h+`(584`ztY&db{@uNGeg5(~^SIheJ@W6hM~FS*Zt4^!q8%9pq<5W~ePm2E%YZ1=Mb`STzCt$9;_Ze8m)
MMo?fvf&mnmrx|OP$N+;t4R0_61-XHP0TikWFhiD`v(scv;W>ZLtEOunasfF_1w993
k~xm=U<1Y-Ba<96uIQHlS_=XK3~wDlG(0h|LJ|X7cq1EznG}!>1DOUj44N>2CSpw#
Am`yq77!DG$&g`5V>_COXo&=9JSdT1bug~rMmD|=XbZA~5ypc9AD8P{*?`VsU;)Bd
LpaFkDPGtZ9)pg!@

literal 0
HcmV?d00001

diff --git a/tests/test_data/2000AA/META/TESTTABLE.RRF b/tests/test_data/2000AA/META/TESTTABLE.RRF
new file mode 100644
index 0000000..fbbe655
--- /dev/null
+++ b/tests/test_data/2000AA/META/TESTTABLE.RRF
+TTY1|Code-1|
+TTY2|Code-2|
+TTY3|Code-3|
\ No newline at end of file
diff --git a/tests/test_data/2000AA/META/TESTTABLE.ctl b/tests/test_data/2000AA/META/TESTTABLE.ctl
new file mode 100644
index 0000000..fed3d33
--- /dev/null
+++ b/tests/test_data/2000AA/META/TESTTABLE.ctl
+options (direct=true)
+load data
+characterset UTF8 length semantics char
+infile 'TESTTABLE.RRF'
+badfile 'TESTTABLE.bad'
+discardfile 'TESTTABLE.dsc'
+truncate
+into table TESTTABLE
+fields terminated by '|'
+trailing nullcols
+(TTY	char(10),
+CODE	char(8)
+)
\ No newline at end of file
diff --git a/tests/test_data/README.MD b/tests/test_data/README.MD
new file mode 100644
index 0000000..bfc6a74
--- /dev/null
+++ b/tests/test_data/README.MD
+# Test data details
+
+2000AA.zip is generated by zipping the associated 2000AA folder. This matches
+the download format, as well as the .ctl/.rrf formats produced by the UMLS
+Metathesaurus exports.
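+
+To regenerate the zip after editing the fixtures (a hypothetical invocation;
+exact flags may vary with your zip tool):
+
+```sh
+cd tests/test_data
+zip -r 2000AA.zip 2000AA
+```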
\ No newline at end of file
diff --git a/tests/test_umls_builder.py b/tests/test_umls_builder.py
new file mode 100644
index 0000000..bc8a982
--- /dev/null
+++ b/tests/test_umls_builder.py
+import os
+from unittest import mock
+
+import pytest
+import responses
+
+from cumulus_library import base_utils, databases, db_config
+from cumulus_library_umls.umls import umls_builder
+
+AUTH_URL = "https://utslogin.nlm.nih.gov/validateUser"
+RELEASE_URL = "https://uts-ws.nlm.nih.gov/releases"
+DOWNLOAD_URL = "https://uts-ws.nlm.nih.gov/download"
+
+
+@pytest.fixture
+def mock_responses():
+    with responses.RequestsMock(assert_all_requests_are_fired=False) as response:
+        with open("./tests/test_data/2000AA.zip", "rb") as download_zip:
+            response.add(
+                responses.GET,
+                AUTH_URL,
+                body="true",
+                status=200,
+                content_type="application/json",
+            )
+            response.add(
+                responses.GET,
+                RELEASE_URL,
+                body="""[{
+                    "fileName": "2000AA.zip",
+                    "releaseVersion": "2000AA",
+                    "releaseDate": "2000-01-01",
+                    "downloadUrl": "https://download.nlm.nih.gov/umls/kss/2000AA/2000AA.zip",
+                    "releaseType": "UMLS Metathesaurus Level 0 Subset",
+                    "product": "UMLS",
+                    "current": true
+                }]""",
+                status=200,
+                content_type="application/json",
+            )
+            response.add(
+                responses.GET,
+                DOWNLOAD_URL,
+                body=download_zip.read(),
+                status=200,
+                content_type="application/zip",
+            )
+        yield response
+
+
+@mock.patch.dict(
+    os.environ,
+    clear=True,
+)
+@mock.patch("pathlib.Path.resolve")
+def test_create_query(mock_resolve, mock_responses, tmp_path):
+    mock_loc = tmp_path / "umls_builder.py"
+    mock_resolve.return_value = mock_loc
+
+    db_config.db_type = "duckdb"
+    config = base_utils.StudyConfig(
+        db=databases.DuckDatabaseBackend(f"{tmp_path}/duckdb"),
+        umls_key="123",
+    )
+    builder = umls_builder.UMLSBuilder()
+    builder.prepare_queries(
+        cursor=config.db.cursor(),
+        schema="main",
+        config=config,
+    )
+    expected = f"""CREATE TABLE IF NOT EXISTS umls__TESTTABLE AS SELECT
+    TTY,
+    CODE
+FROM read_parquet('{tmp_path / "generated_parquet/2000AA"}/TESTTABLE.parquet/*.parquet')"""
+    assert expected == builder.queries[0]
+
+
+@mock.patch.dict(
+    os.environ,
+    clear=True,
+)
+@mock.patch("pathlib.Path.resolve")
+def test_create_query_download_exists(mock_resolve, mock_responses, tmp_path):
+    mock_loc = tmp_path / "umls_builder.py"
+    mock_resolve.return_value = mock_loc
+
+    # Simulate artifacts left over from a previous, older release
+    prev_download_path = tmp_path / "downloads/1999AA/"
+    prev_download_path.mkdir(exist_ok=True, parents=True)
+    prev_parquet_path = tmp_path / "generated_parquet/1999AA/"
+    prev_parquet_path.mkdir(exist_ok=True, parents=True)
+
+    db_config.db_type = "duckdb"
+    config = base_utils.StudyConfig(
+        db=databases.DuckDatabaseBackend(f"{tmp_path}/duckdb"),
+        umls_key="123",
+    )
+    builder = umls_builder.UMLSBuilder()
+    builder.prepare_queries(
+        cursor=config.db.cursor(),
+        schema="main",
+        config=config,
+    )
+    # The old release should have been cleaned up, leaving only 2000AA
+    download_dirs = sorted((tmp_path / "downloads").iterdir())
+    assert len(download_dirs) == 1
+    assert "2000AA" in str(download_dirs[0])
+    parquet_dirs = sorted((tmp_path / "generated_parquet").iterdir())
+    assert len(parquet_dirs) == 1
+    assert "2000AA" in str(parquet_dirs[0])