diff --git a/.github/workflows/code-quality-main.yaml b/.github/workflows/code-quality-main.yaml new file mode 100644 index 0000000..3703b1f --- /dev/null +++ b/.github/workflows/code-quality-main.yaml @@ -0,0 +1,22 @@ +# Same as `code-quality-pr.yaml` but triggered on commit to main branch +# and runs on all files (instead of only the changed ones) + +name: Code Quality Main + +on: + push: + branches: [main] + +jobs: + code-quality: + runs-on: ubuntu-latest + + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v3 + + - name: Run pre-commits + uses: pre-commit/action@v3.0.1 diff --git a/.github/workflows/code-quality-pr.yaml b/.github/workflows/code-quality-pr.yaml new file mode 100644 index 0000000..a97d2c0 --- /dev/null +++ b/.github/workflows/code-quality-pr.yaml @@ -0,0 +1,36 @@ +# This workflow finds which files were changed, prints them, +# and runs `pre-commit` on those files. + +# Inspired by the sktime library: +# https://github.com/alan-turing-institute/sktime/blob/main/.github/workflows/test.yml + +name: Code Quality PR + +on: + pull_request: + branches: [main, "release/*", "dev"] + +jobs: + code-quality: + runs-on: ubuntu-latest + + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v3 + + - name: Find modified files + id: file_changes + uses: trilom/file-changes-action@v1.2.4 + with: + output: " " + + - name: List modified files + run: echo '${{ steps.file_changes.outputs.files}}' + + - name: Run pre-commits + uses: pre-commit/action@v3.0.1 + with: + extra_args: --files ${{ steps.file_changes.outputs.files}} diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml new file mode 100644 index 0000000..b32a1bd --- /dev/null +++ b/.github/workflows/tests.yaml @@ -0,0 +1,43 @@ +name: Tests + +on: + push: + branches: [main] + pull_request: + branches: [main, "release/*", "dev"] + +jobs: + run_tests_ubuntu: + runs-on: 
ubuntu-latest + + strategy: + fail-fast: false + + timeout-minutes: 30 + + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Set up Python 3.12 + uses: actions/setup-python@v3 + with: + python-version: "3.12" + + - name: Install packages + run: | + pip install -e . + pip install pytest + pip install "pytest-cov[toml]" + + #---------------------------------------------- + # run test suite + #---------------------------------------------- + - name: Run tests + run: | + pytest -v --doctest-modules --cov + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v4.0.1 + with: + token: ${{ secrets.CODECOV_TOKEN }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..7540f52 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,130 @@ +default_language_version: + python: python3.12 + +exclude: "sample_data|docs/MIMIC_IV_tutorial/wandb_reports" + +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.4.0 + hooks: + # list of supported hooks: https://pre-commit.com/hooks.html + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-docstring-first + - id: check-yaml + - id: debug-statements + - id: detect-private-key + - id: check-executables-have-shebangs + - id: check-toml + - id: check-case-conflict + - id: check-added-large-files + args: [--maxkb, "800"] + + # python code formatting + - repo: https://github.com/psf/black + rev: 23.7.0 + hooks: + - id: black + args: [--line-length, "110"] + + # python import sorting + - repo: https://github.com/PyCQA/isort + rev: 5.12.0 + hooks: + - id: isort + args: ["--profile", "black", "--filter-files", "-o", "wandb"] + + - repo: https://github.com/PyCQA/autoflake + rev: v2.2.0 + hooks: + - id: autoflake + + # python upgrading syntax to newer version + - repo: https://github.com/asottile/pyupgrade + rev: v3.10.1 + hooks: + - id: pyupgrade + args: [--py312-plus] + + # python docstring formatting + - repo:
https://github.com/myint/docformatter + rev: v1.7.5 + hooks: + - id: docformatter + args: [--in-place, --wrap-summaries=110, --wrap-descriptions=110] + + # python check (PEP8), programming errors and code complexity + - repo: https://github.com/PyCQA/flake8 + rev: 6.1.0 + hooks: + - id: flake8 + args: + [ + "--max-complexity=10", + "--extend-ignore", + "E402,E701,E251,E226,E302,W504,E704,E402,E401,C901,E203", + "--max-line-length=110", + "--exclude", + "logs/*,data/*", + "--per-file-ignores", + "__init__.py:F401", + ] + + # yaml formatting + - repo: https://github.com/pre-commit/mirrors-prettier + rev: v3.0.3 + hooks: + - id: prettier + types: [yaml] + exclude: "environment.yaml" + + # shell scripts linter + - repo: https://github.com/shellcheck-py/shellcheck-py + rev: v0.9.0.5 + hooks: + - id: shellcheck + + # md formatting + - repo: https://github.com/executablebooks/mdformat + rev: 0.7.17 + hooks: + - id: mdformat + args: ["--number"] + additional_dependencies: + - mdformat-gfm + - mdformat-tables + - mdformat_frontmatter + - mdformat-myst + - mdformat-black + - mdformat-config + - mdformat-shfmt + + # word spelling linter + - repo: https://github.com/codespell-project/codespell + rev: v2.2.5 + hooks: + - id: codespell + args: + - --skip=logs/**,data/**,*.ipynb,*.bib,env.yml,env_cpu.yml,*.svg,poetry.lock + - --ignore-words-list=ehr + + # jupyter notebook cell output clearing + - repo: https://github.com/kynan/nbstripout + rev: 0.6.1 + hooks: + - id: nbstripout + + # jupyter notebook linting + - repo: https://github.com/nbQA-dev/nbQA + rev: 1.7.0 + hooks: + - id: nbqa-black + args: ["--line-length=110"] + - id: nbqa-isort + args: ["--profile=black"] + - id: nbqa-flake8 + args: + [ + "--extend-ignore=E203,E402,E501,F401,F841", + "--exclude=logs/*,data/*", + ] diff --git a/README.md b/README.md index ab10ce4..9abdc74 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,42 @@ -# MEDS_Tabular_AutoML -Limited automatic tabular ML pipelines for generic MEDS datasets. 
+# Scalable tabularization and tabular feature usage utilities over generic MEDS datasets +This repository provides utilities and scripts to run limited automatic tabular ML pipelines for generic MEDS +datasets. + +Why not other systems? + - [TemporAI](https://github.com/vanderschaarlab/temporai) is the most natural competitor, and already + supports AutoML capabilities. However, TemporAI (as of now) does not support generic MEDS datasets, and it + is not clear if their AutoML systems will scale to the size of datasets we need to support. But, further + investigation is needed, and it may be the case that the best solution here is simply to write a custom + data source for MEDS data within TemporAI and leverage their tools. + +# Installation +Clone this repository and install the requirements by running `pip install .` in the root directory. + +# Usage +This repository consists of two key pieces: + 1. Construction of and efficient loading of tabular (flat, non-longitudinal) summary features describing + patient records in MEDS over arbitrary time-windows (e.g. 1 year, 6 months, etc.) either backwards or + forwards in time from a given index date. Naturally, only "look-back" windows should be used for + future-event prediction tasks; however, the capability to summarize "look-ahead" windows is also useful + for characterizing and describing the differences between patient populations statistically. + 2. Running basic AutoML pipelines over these tabular features to predict arbitrary binary classification + downstream tasks defined over these datasets. The "AutoML" part of this is not particularly advanced -- + what is more advanced is the efficient construction, storage, and loading of tabular features for the + candidate AutoML models, enabling a far more extensive search over different featurization strategies. + +## Feature Construction, Storage, and Loading + +## AutoML Pipelines + +# TODOs + 1. 
Leverage the "event bound aggregation" capabilities of [ESGPT Task + Select](https://github.com/justin13601/ESGPTTaskQuerying/) to construct tabular summary features for + event-bound historical windows (e.g., until the prior admission, until the last diagnosis of some type, + etc.). + 2. Support more feature aggregation functions. + 3. Probably rename this repository, as the focus is really more on the tabularization and feature usage + utilities than on the AutoML pipelines themselves. + 4. Import, rather than reimplement, the mapper utilities from the MEDS preprocessing repository. + 5. Investigate the feasibility of using TemporAI for this task. + 6. Consider splitting the feature construction and AutoML pipeline parts of this repository into separate + repositories. diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..1aa7f41 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,27 @@ +[build-system] +requires = ["setuptools>=61.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "MEDS_tabularization" +version = "0.0.1" +authors = [ + { name="Matthew McDermott", email="mattmcdermott8@gmail.com" }, +] +description = "Scalable tabularization and tabular AutoML utilities over generic MEDS datasets" +readme = "README.md" +requires-python = ">=3.12" +classifiers = [ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", +] +dependencies = ["polars", "pyarrow", "loguru", "hydra-core", "numpy"] + +[project.optional-dependencies] +dev = ["pre-commit"] +tests = ["pytest", "pytest-cov", "rootutils"] + +[project.urls] +Homepage = "https://github.com/mmcdermott/MEDS_Tabular_AutoML" +Issues = "https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues" diff --git a/src/MEDS_tabular_automl/__init__.py b/src/MEDS_tabular_automl/__init__.py new file mode 100644 index 0000000..e69de29