Added preliminary files.

mmcdermott · May 21, 2024 · 6637a92 · 6637a92
1 parent 1d0afa4
commit 6637a92
Show file tree

Hide file tree

Showing 7 changed files with 300 additions and 2 deletions.
diff --git a/.github/workflows/code-quality-main.yaml b/.github/workflows/code-quality-main.yaml
@@ -0,0 +1,27 @@
+# Same as `code-quality-pr.yaml` but triggered on commit to main branch
+# and runs on all files (instead of only the changed ones)
+
+name: Code Quality Main
+
+on:
+  push:
+    branches: [main]
+
+jobs:
+  code-quality:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+
+      # Pin the interpreter to the one `.pre-commit-config.yaml` requests via
+      # `default_language_version`, so hook environments can always be built.
+      - name: Set up Python
+        uses: actions/setup-python@v3
+        with:
+          python-version: "3.12"
+
+      # No `extra_args` are given, so the action uses its default and checks every file.
+      - name: Run pre-commits
+        uses: pre-commit/action@v3.0.0
diff --git a/.github/workflows/code-quality-pr.yaml b/.github/workflows/code-quality-pr.yaml
@@ -0,0 +1,46 @@
+# This workflow finds which files were changed, prints them,
+# and runs `pre-commit` on those files.
+
+# Inspired by the sktime library:
+# https://github.com/alan-turing-institute/sktime/blob/main/.github/workflows/test.yml
+
+name: Code Quality PR
+
+on:
+  pull_request:
+    branches: [main, "release/*", "dev"]
+
+jobs:
+  code-quality:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+
+      # Pin the interpreter to the one `.pre-commit-config.yaml` requests via
+      # `default_language_version`, so hook environments can always be built.
+      - name: Set up Python
+        uses: actions/setup-python@v3
+        with:
+          python-version: "3.12"
+
+      # `output: " "` asks the action for a space-delimited list of changed paths,
+      # which is the form `pre-commit run --files` expects below.
+      - name: Find modified files
+        id: file_changes
+        uses: trilom/file-changes-action@v1.2.4
+        with:
+          output: " "
+
+      # Pass the list through an environment variable instead of expanding it inside
+      # the script, so a crafted file name in a PR cannot inject shell commands.
+      - name: List modified files
+        env:
+          MODIFIED_FILES: ${{ steps.file_changes.outputs.files }}
+        run: echo "$MODIFIED_FILES"
+
+      - name: Run pre-commits
+        uses: pre-commit/action@v3.0.0
+        with:
+          extra_args: --files ${{ steps.file_changes.outputs.files}}
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
@@ -0,0 +1,45 @@
+name: Tests
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main, "release/*", "dev"]
+
+jobs:
+  run_tests_ubuntu:
+    runs-on: ubuntu-latest
+
+    strategy:
+      fail-fast: false
+
+    timeout-minutes: 30
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+
+      # `pyproject.toml` declares `requires-python = ">=3.12"`, so the editable
+      # install below is rejected by pip on any older interpreter.
+      - name: Set up Python 3.12
+        uses: actions/setup-python@v3
+        with:
+          python-version: "3.12"
+
+      - name: Install packages
+        run: |
+          pip install -e .
+          pip install pytest
+          pip install pytest-cov[toml]
+
+      #----------------------------------------------
+      #              run test suite
+      #----------------------------------------------
+      - name: Run tests
+        run: |
+          pytest -v --doctest-modules --cov
+
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v4.0.1
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,132 @@
+# Interpreter pre-commit uses when building Python-based hook environments.
+default_language_version:
+  python: python3.12
+
+# Regex of paths that every hook skips.
+# NOTE(review): these paths look inherited from another project -- confirm they apply here.
+exclude: "sample_data|docs/MIMIC_IV_tutorial/wandb_reports"
+
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.4.0
+    hooks:
+      # list of supported hooks: https://pre-commit.com/hooks.html
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+      - id: check-docstring-first
+      - id: check-yaml
+      - id: debug-statements
+      - id: detect-private-key
+      - id: check-executables-have-shebangs
+      - id: check-toml
+      - id: check-case-conflict
+      - id: check-added-large-files
+        args: [--maxkb, "800"]
+
+  # python code formatting
+  - repo: https://github.com/psf/black
+    rev: 23.7.0
+    hooks:
+      - id: black
+        args: [--line-length, "110"]
+
+  # python import sorting
+  - repo: https://github.com/PyCQA/isort
+    rev: 5.12.0
+    hooks:
+      - id: isort
+        args: ["--profile", "black", "--filter-files", "-o", "wandb"]
+
+  # python removal of unused imports
+  - repo: https://github.com/PyCQA/autoflake
+    rev: v2.2.0
+    hooks:
+      - id: autoflake
+
+  # python upgrading syntax to newer version
+  - repo: https://github.com/asottile/pyupgrade
+    rev: v3.10.1
+    hooks:
+      - id: pyupgrade
+        args: [--py310-plus]
+
+  # python docstring formatting
+  - repo: https://github.com/myint/docformatter
+    rev: v1.7.5
+    hooks:
+      - id: docformatter
+        args: [--in-place, --wrap-summaries=110, --wrap-descriptions=110]
+
+  # python check (PEP8), programming errors and code complexity
+  - repo: https://github.com/PyCQA/flake8
+    rev: 6.1.0
+    hooks:
+      - id: flake8
+        # NOTE(review): C901 is in the ignore list, which silences the complexity
+        # check that `--max-complexity=10` enables -- confirm this is intended.
+        args:
+          - "--max-complexity=10"
+          - "--extend-ignore"
+          - "E402,E701,E251,E226,E302,W504,E704,E401,C901,E203"
+          - "--max-line-length=110"
+          - "--exclude"
+          - "logs/*,data/*"
+          - "--per-file-ignores"
+          - "__init__.py:F401"
+
+  # yaml formatting
+  - repo: https://github.com/pre-commit/mirrors-prettier
+    rev: v3.0.3
+    hooks:
+      - id: prettier
+        types: [yaml]
+        exclude: "environment.yaml"
+
+  # shell scripts linter
+  - repo: https://github.com/shellcheck-py/shellcheck-py
+    rev: v0.9.0.5
+    hooks:
+      - id: shellcheck
+
+  # md formatting
+  - repo: https://github.com/executablebooks/mdformat
+    rev: 0.7.17
+    hooks:
+      - id: mdformat
+        args: ["--number"]
+        additional_dependencies:
+          - mdformat-gfm
+          - mdformat-tables
+          - mdformat_frontmatter
+          - mdformat-myst
+          - mdformat-black
+          - mdformat-config
+          - mdformat-shfmt
+
+  # word spelling linter
+  - repo: https://github.com/codespell-project/codespell
+    rev: v2.2.5
+    hooks:
+      - id: codespell
+        args:
+          - --skip=logs/**,data/**,*.ipynb,*.bib,env.yml,env_cpu.yml,*.svg,poetry.lock
+          - --ignore-words-list=ehr
+
+  # jupyter notebook cell output clearing
+  - repo: https://github.com/kynan/nbstripout
+    rev: 0.6.1
+    hooks:
+      - id: nbstripout
+
+  # jupyter notebook linting
+  - repo: https://github.com/nbQA-dev/nbQA
+    rev: 1.7.0
+    hooks:
+      - id: nbqa-black
+        args: ["--line-length=110"]
+      - id: nbqa-isort
+        args: ["--profile=black"]
+      - id: nbqa-flake8
+        args:
+          - "--extend-ignore=E203,E402,E501,F401,F841"
+          - "--exclude=logs/*,data/*"
diff --git a/README.md b/README.md
@@ -1,2 +1,42 @@
-# MEDS_Tabular_AutoML
-Limited automatic tabular ML pipelines for generic MEDS datasets.
+# Scalable tabularization and tabular feature usage utilities over generic MEDS datasets
+This repository provides utilities and scripts to run limited automatic tabular ML pipelines for generic MEDS
+datasets.
+
+Why not other systems?
+  - [TemporAI](https://github.com/vanderschaarlab/temporai) is the most natural competitor, and already
+    supports AutoML capabilities. However, TemporAI (as of now) does not support generic MEDS datasets, and it
+    is not clear whether their AutoML systems will scale to the size of datasets we need to support. Further
+    investigation is needed, though, and the best solution here may simply be to write a custom data source
+    for MEDS data within TemporAI and leverage their tools.
+
+# Installation
+Clone this repository and install the package and its dependencies by running `pip install .` in the root directory.
+
+# Usage
+This repository consists of two key pieces:
+  1. Construction and efficient loading of tabular (flat, non-longitudinal) summary features describing
+     patient records in MEDS over arbitrary time-windows (e.g. 1 year, 6 months, etc.) either backwards or
+     forwards in time from a given index date. Naturally, only "look-back" windows should be used for
+     future-event prediction tasks; however, the capability to summarize "look-ahead" windows is also useful
+     for characterizing and describing the differences between patient populations statistically.
+  2. Running basic AutoML pipelines over these tabular features to predict arbitrary binary classification
+     downstream tasks defined over these datasets. The "AutoML" part of this is not particularly advanced --
+     what is more advanced is the efficient construction, storage, and loading of tabular features for the
+     candidate AutoML models, enabling a far more extensive search over different featurization strategies.
+
+## Feature Construction, Storage, and Loading
+
+## AutoML Pipelines
+
+# TODOs
+  1. Leverage the "event bound aggregation" capabilities of [ESGPT Task
+     Select](https://github.com/justin13601/ESGPTTaskQuerying/) to construct tabular summary features for
+     event-bound historical windows (e.g., until the prior admission, until the last diagnosis of some type,
+     etc.).
+  2. Support more feature aggregation functions.
+  3. Probably rename this repository, as the focus is really more on the tabularization and feature usage
+     utilities than on the AutoML pipelines themselves.
+  4. Import, rather than reimplement, the mapper utilities from the MEDS preprocessing repository.
+  5. Investigate the feasibility of using TemporAI for this task.
+  6. Consider splitting the feature construction and AutoML pipeline parts of this repository into separate
+     repositories.
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,29 @@
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "MEDS_tabularization"
+version = "0.0.1"
+authors = [
+  # NOTE(review): the address below is a redaction placeholder -- restore the real contact email.
+  { name="Matthew McDermott", email="[email protected]" },
+]
+description = "Scalable tabularization and tabular feature usage utilities over generic MEDS datasets"
+readme = "README.md"
+requires-python = ">=3.12"
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "License :: OSI Approved :: MIT License",
+    "Operating System :: OS Independent",
+]
+dependencies = ["polars", "pyarrow", "loguru", "hydra-core", "numpy"]
+
+# Extras, installable with e.g. `pip install ".[dev]"` or `pip install ".[tests]"`.
+[project.optional-dependencies]
+dev = ["pre-commit"]
+tests = ["pytest", "pytest-cov", "rootutils"]
+
+[project.urls]
+Homepage = "https://github.com/mmcdermott/MEDS_Tabular_AutoML"
+Issues = "https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues"
diff --git a/src/MEDS_tabular_automl/__init__.py b/src/MEDS_tabular_automl/__init__.py