-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
1d0afa4
commit 6637a92
Showing
7 changed files
with
300 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
# Same as `code-quality-pr.yaml` but triggered on commit to main branch | ||
# and runs on all files (instead of only the changed ones) | ||
|
||
name: Code Quality Main | ||
|
||
on: | ||
push: | ||
branches: [main] | ||
|
||
jobs: | ||
code-quality: | ||
runs-on: ubuntu-latest | ||
|
||
steps: | ||
- name: Checkout | ||
uses: actions/checkout@v3 | ||
|
||
# Pin the interpreter: the pre-commit config sets `default_language_version: python3.12`, | ||
# so the hooks need a python3.12 executable on the runner rather than its default Python. | ||
- name: Set up Python 3.12 | ||
uses: actions/setup-python@v3 | ||
with: | ||
python-version: "3.12" | ||
|
||
- name: Run pre-commits | ||
uses: pre-commit/action@v3.0.1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
# This workflow finds which files were changed, prints them, | ||
# and runs `pre-commit` on those files. | ||
|
||
# Inspired by the sktime library: | ||
# https://github.com/alan-turing-institute/sktime/blob/main/.github/workflows/test.yml | ||
|
||
name: Code Quality PR | ||
|
||
on: | ||
pull_request: | ||
branches: [main, "release/*", "dev"] | ||
|
||
jobs: | ||
code-quality: | ||
runs-on: ubuntu-latest | ||
|
||
steps: | ||
- name: Checkout | ||
uses: actions/checkout@v3 | ||
|
||
# Pin the interpreter: the pre-commit config sets `default_language_version: python3.12`, | ||
# so the hooks need a python3.12 executable on the runner rather than its default Python. | ||
- name: Set up Python 3.12 | ||
uses: actions/setup-python@v3 | ||
with: | ||
python-version: "3.12" | ||
|
||
- name: Find modified files | ||
id: file_changes | ||
uses: trilom/file-changes-action@v1.2.4 | ||
with: | ||
output: " " | ||
|
||
# Print the changed files for debugging. The list is handed to the shell through an | ||
# environment variable instead of being interpolated into the script text, because file | ||
# names come from the pull request and a crafted name could otherwise inject shell commands. | ||
- name: List modified files | ||
env: | ||
MODIFIED_FILES: ${{ steps.file_changes.outputs.files }} | ||
run: echo "$MODIFIED_FILES" | ||
|
||
- name: Run pre-commits | ||
uses: pre-commit/action@v3.0.1 | ||
with: | ||
extra_args: --files ${{ steps.file_changes.outputs.files}} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
name: Tests | ||
|
||
on: | ||
push: | ||
branches: [main] | ||
pull_request: | ||
branches: [main, "release/*", "dev"] | ||
|
||
jobs: | ||
run_tests_ubuntu: | ||
runs-on: ubuntu-latest | ||
|
||
strategy: | ||
fail-fast: false | ||
|
||
timeout-minutes: 30 | ||
|
||
steps: | ||
- name: Checkout | ||
uses: actions/checkout@v3 | ||
|
||
# Use Python 3.12: pyproject.toml declares `requires-python = ">=3.12"`, so | ||
# `pip install -e .` in the next step refuses to install on an older interpreter. | ||
- name: Set up Python 3.12 | ||
uses: actions/setup-python@v3 | ||
with: | ||
python-version: "3.12" | ||
|
||
- name: Install packages | ||
run: | | ||
pip install -e . | ||
pip install pytest | ||
pip install pytest-cov[toml] | ||
#---------------------------------------------- | ||
# run test suite | ||
#---------------------------------------------- | ||
- name: Run tests | ||
run: | | ||
pytest -v --doctest-modules --cov | ||
- name: Upload coverage to Codecov | ||
uses: codecov/codecov-action@v4.0.1 | ||
with: | ||
token: ${{ secrets.CODECOV_TOKEN }} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,130 @@ | ||
default_language_version: | ||
python: python3.12 | ||
|
||
exclude: "sample_data|docs/MIMIC_IV_tutorial/wandb_reports" | ||
|
||
repos: | ||
- repo: https://github.com/pre-commit/pre-commit-hooks | ||
rev: v4.4.0 | ||
hooks: | ||
# list of supported hooks: https://pre-commit.com/hooks.html | ||
- id: trailing-whitespace | ||
- id: end-of-file-fixer | ||
- id: check-docstring-first | ||
- id: check-yaml | ||
- id: debug-statements | ||
- id: detect-private-key | ||
- id: check-executables-have-shebangs | ||
- id: check-toml | ||
- id: check-case-conflict | ||
- id: check-added-large-files | ||
args: [--maxkb, "800"] | ||
|
||
# python code formatting | ||
- repo: https://github.com/psf/black | ||
rev: 23.7.0 | ||
hooks: | ||
- id: black | ||
args: [--line-length, "110"] | ||
|
||
# python import sorting | ||
- repo: https://github.com/PyCQA/isort | ||
rev: 5.12.0 | ||
hooks: | ||
- id: isort | ||
args: ["--profile", "black", "--filter-files", "-o", "wandb"] | ||
|
||
- repo: https://github.com/PyCQA/autoflake | ||
rev: v2.2.0 | ||
hooks: | ||
- id: autoflake | ||
|
||
# python upgrading syntax to newer version | ||
- repo: https://github.com/asottile/pyupgrade | ||
rev: v3.10.1 | ||
hooks: | ||
- id: pyupgrade | ||
args: [--py310-plus] | ||
|
||
# python docstring formatting | ||
- repo: https://github.com/myint/docformatter | ||
rev: v1.7.5 | ||
hooks: | ||
- id: docformatter | ||
args: [--in-place, --wrap-summaries=110, --wrap-descriptions=110] | ||
|
||
# python style check (PEP8), programming errors and code complexity | ||
# Each ignored code is listed exactly once in --extend-ignore. | ||
- repo: https://github.com/PyCQA/flake8 | ||
rev: 6.1.0 | ||
hooks: | ||
- id: flake8 | ||
args: | ||
[ | ||
"--max-complexity=10", | ||
"--extend-ignore", | ||
"E402,E701,E251,E226,E302,W504,E704,E401,C901,E203", | ||
"--max-line-length=110", | ||
"--exclude", | ||
"logs/*,data/*", | ||
"--per-file-ignores", | ||
"__init__.py:F401", | ||
] | ||
|
||
# yaml formatting | ||
- repo: https://github.com/pre-commit/mirrors-prettier | ||
rev: v3.0.3 | ||
hooks: | ||
- id: prettier | ||
types: [yaml] | ||
exclude: "environment.yaml" | ||
|
||
# shell scripts linter | ||
- repo: https://github.com/shellcheck-py/shellcheck-py | ||
rev: v0.9.0.5 | ||
hooks: | ||
- id: shellcheck | ||
|
||
# md formatting | ||
- repo: https://github.com/executablebooks/mdformat | ||
rev: 0.7.17 | ||
hooks: | ||
- id: mdformat | ||
args: ["--number"] | ||
additional_dependencies: | ||
- mdformat-gfm | ||
- mdformat-tables | ||
- mdformat_frontmatter | ||
- mdformat-myst | ||
- mdformat-black | ||
- mdformat-config | ||
- mdformat-shfmt | ||
|
||
# word spelling linter | ||
- repo: https://github.com/codespell-project/codespell | ||
rev: v2.2.5 | ||
hooks: | ||
- id: codespell | ||
args: | ||
- --skip=logs/**,data/**,*.ipynb,*.bib,env.yml,env_cpu.yml,*.svg,poetry.lock | ||
- --ignore-words-list=ehr | ||
|
||
# jupyter notebook cell output clearing | ||
- repo: https://github.com/kynan/nbstripout | ||
rev: 0.6.1 | ||
hooks: | ||
- id: nbstripout | ||
|
||
# jupyter notebook linting | ||
- repo: https://github.com/nbQA-dev/nbQA | ||
rev: 1.7.0 | ||
hooks: | ||
- id: nbqa-black | ||
args: ["--line-length=110"] | ||
- id: nbqa-isort | ||
args: ["--profile=black"] | ||
- id: nbqa-flake8 | ||
args: | ||
[ | ||
"--extend-ignore=E203,E402,E501,F401,F841", | ||
"--exclude=logs/*,data/*", | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,42 @@ | ||
# MEDS_Tabular_AutoML | ||
Limited automatic tabular ML pipelines for generic MEDS datasets. | ||
# Scalable tabularization and tabular feature usage utilities over generic MEDS datasets | ||
This repository provides utilities and scripts to run limited automatic tabular ML pipelines for generic MEDS | ||
datasets. | ||
|
||
Why not other systems? | ||
- [TemporAI](https://github.com/vanderschaarlab/temporai) is the most natural competitor, and already | ||
supports AutoML capabilities. However, TemporAI (as of now) does not support generic MEDS datasets, and it | ||
is not clear if their AutoML systems will scale to the size of datasets we need to support. But, further | ||
investigation is needed, and it may be the case that the best solution here is simply to write a custom | ||
data source for MEDS data within TemporAI and leverage their tools. | ||
|
||
# Installation | ||
Clone this repository and install the requirements by running `pip install .` in the root directory. | ||
|
||
# Usage | ||
This repository consists of two key pieces: | ||
1. Construction and efficient loading of tabular (flat, non-longitudinal) summary features describing | ||
patient records in MEDS over arbitrary time-windows (e.g. 1 year, 6 months, etc.) either backwards or | ||
forwards in time from a given index date. Naturally, only "look-back" windows should be used for | ||
future-event prediction tasks; however, the capability to summarize "look-ahead" windows is also useful | ||
for characterizing and describing the differences between patient populations statistically. | ||
2. Running basic AutoML pipelines over these tabular features to predict arbitrary binary classification | ||
downstream tasks defined over these datasets. The "AutoML" part of this is not particularly advanced -- | ||
what is more advanced is the efficient construction, storage, and loading of tabular features for the | ||
candidate AutoML models, enabling a far more extensive search over different featurization strategies. | ||
|
||
## Feature Construction, Storage, and Loading | ||
|
||
## AutoML Pipelines | ||
|
||
# TODOs | ||
1. Leverage the "event bound aggregation" capabilities of [ESGPT Task | ||
Select](https://github.com/justin13601/ESGPTTaskQuerying/) to construct tabular summary features for | ||
event-bound historical windows (e.g., until the prior admission, until the last diagnosis of some type, | ||
etc.). | ||
2. Support more feature aggregation functions. | ||
3. Probably rename this repository, as the focus is really more on the tabularization and feature usage | ||
utilities than on the AutoML pipelines themselves. | ||
4. Import, rather than reimplement, the mapper utilities from the MEDS preprocessing repository. | ||
5. Investigate the feasibility of using TemporAI for this task. | ||
6. Consider splitting the feature construction and AutoML pipeline parts of this repository into separate | ||
repositories. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
[build-system] | ||
requires = ["setuptools>=61.0"] | ||
build-backend = "setuptools.build_meta" | ||
|
||
[project] | ||
name = "MEDS_tabularization" | ||
version = "0.0.1" | ||
authors = [ | ||
{ name="Matthew McDermott", email="[email protected]" }, | ||
] | ||
description = "TODO" | ||
readme = "README.md" | ||
requires-python = ">=3.12" | ||
classifiers = [ | ||
"Programming Language :: Python :: 3", | ||
"License :: OSI Approved :: MIT License", | ||
"Operating System :: OS Independent", | ||
] | ||
dependencies = ["polars", "pyarrow", "loguru", "hydra-core", "numpy"] | ||
|
||
[project.optional-dependencies] | ||
dev = ["pre-commit"] | ||
tests = ["pytest", "pytest-cov", "rootutils"] | ||
|
||
# Project links published with the package metadata; they point at this repository | ||
# (MEDS_Tabular_AutoML, per the README title). | ||
[project.urls] | ||
Homepage = "https://github.com/mmcdermott/MEDS_Tabular_AutoML" | ||
Issues = "https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues" |
Empty file.