
Commit

Merge pull request #5 from mmcdermott/clean
Clean
Oufattole authored Jun 5, 2024
2 parents 6240c8a + ec73910 commit dba36ce
Showing 30 changed files with 3,583 additions and 55 deletions.
10 changes: 4 additions & 6 deletions .github/workflows/tests.yaml
@@ -19,23 +19,21 @@ jobs:
       - name: Checkout
         uses: actions/checkout@v3

-      - name: Set up Python 3.11
+      - name: Set up Python 3.12
         uses: actions/setup-python@v3
         with:
-          python-version: "3.11"
+          python-version: "3.12"

       - name: Install packages
         run: |
-          pip install -e .
-          pip install pytest
-          pip install pytest-cov[toml]
+          pip install -e .[tests]
       #----------------------------------------------
       # run test suite
       #----------------------------------------------
       - name: Run tests
         run: |
-          pytest -v --doctest-modules --cov
+          pytest -v --doctest-modules --cov --ignore=hf_cohort/
       - name: Upload coverage to Codecov
         uses: codecov/[email protected]
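The updated test step can be reproduced locally with the same commands (a sketch; the extras group and pytest flags are taken verbatim from the diff above):

```bash
# Mirror the new CI test step locally.
pip install -e .[tests]
pytest -v --doctest-modules --cov --ignore=hf_cohort/
```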
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
@@ -38,6 +38,7 @@ repos:
     rev: v2.2.0
     hooks:
       - id: autoflake
+        args: [--in-place, --remove-all-unused-imports]

   # python upgrading syntax to newer version
   - repo: https://github.com/asottile/pyupgrade
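To exercise just this hook locally, a minimal sketch using pre-commit's standard CLI (the hook id `autoflake` comes from the config above):

```bash
# Run only the autoflake hook, with the newly added arguments, across all files.
pre-commit run autoflake --all-files
```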
30 changes: 30 additions & 0 deletions README.md
@@ -42,6 +42,36 @@ This repository consists of two key pieces:
what is more advanced is the efficient construction, storage, and loading of tabular features for the
candidate AutoML models, enabling a far more extensive search over different featurization strategies.

### Scripts and Examples

See `tests/test_tabularize_integration.py` for an example of the end-to-end pipeline run on synthetic data. This
script is a functional test that is also run with `pytest` to verify the correctness of the algorithm.

#### Core Scripts:

1. `scripts/identify_columns.py` loads all training shards to identify which feature columns
   to generate tabular data for.
2. `scripts/tabularize_static.py` iterates through shards and generates tabular vectors for
   each patient, with a single row per patient per shard.
3. `scripts/summarize_over_windows.py` iterates, for each shard, through window sizes and aggregations, and
   horizontally concatenates the outputs to generate the final tabular representations at every event time for
   every patient.
4. `scripts/tabularize_merge` aligns the time-series window aggregations (generated in the previous step) with
   the static tabular vectors and caches them for training.
5. `scripts/hf_cohort/aces_task_extraction.py` generates the task labels and caches them with the event_id
   indexes that align them with the nearest prior event in the tabular data.
6. `scripts/xgboost_sweep.py` runs an XGBoost hyperparameter sweep, iterating through the labels and
   corresponding tabular data.

We run this on an example dataset using the following bash scripts in sequence (a per-step invocation sketch follows the block):

```bash
bash hf_cohort_shard.sh # processes the dataset into MEDS format
bash hf_cohort_e2e.sh # performs steps 1-4 above
bash hf_cohort/aces_task.sh # generates labels (step 5)
bash xgboost.sh # trains XGBoost (step 6)
```
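Each stage can also be invoked on its own. A hedged sketch, assuming the scripts are Hydra entry points as the configs under `src/MEDS_tabular_automl/configs/` suggest; the cohort path is illustrative:

```bash
# Hypothetical single-step invocations; MEDS_cohort_dir is a mandatory config value.
python scripts/identify_columns.py MEDS_cohort_dir=/path/to/meds_cohort
python scripts/tabularize_static.py MEDS_cohort_dir=/path/to/meds_cohort do_overwrite=False
```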


## Feature Construction, Storage, and Loading

Tabularization of a (raw) MEDS dataset is done by running the `scripts/data/tabularize.py` script. This script
41 changes: 0 additions & 41 deletions configs/tabularize.yaml

This file was deleted.

27 changes: 19 additions & 8 deletions pyproject.toml
@@ -1,27 +1,38 @@
-[build-system]
-requires = ["setuptools>=61.0"]
-build-backend = "setuptools.build_meta"
-
 [project]
 name = "MEDS_tabularization"
 version = "0.0.1"
 authors = [
   { name="Matthew McDermott", email="[email protected]" },
+  { name="Nassim Oufattole", email="[email protected]" },
+  { name="Teya Bergamaschi", email="[email protected]" },
 ]
-description = "TODO"
+description = "Scalable Tabularization of MEDS format Time-Series data"
 readme = "README.md"
 requires-python = ">=3.12"
 classifiers = [
   "Programming Language :: Python :: 3",
   "License :: OSI Approved :: MIT License",
   "Operating System :: OS Independent",
 ]
-dependencies = ["polars", "pyarrow", "loguru", "hydra-core", "numpy"]
+dependencies = ["polars", "pyarrow", "loguru", "hydra-core", "numpy", "scipy", "pandas", "tqdm", "xgboost", "scikit-learn", "hydra-optuna-sweeper", "hydra-joblib-launcher", "ml-mixins"]
+
+[project.scripts]
+meds-tab-describe = "MEDS_tabular_automl.scripts.describe_codes:main"
+meds-tab-tabularize-static = "MEDS_tabular_automl.scripts.tabularize_static:main"
+meds-tab-tabularize-time-series = "MEDS_tabular_automl.scripts.tabularize_time_series:main"
+meds-tab-cache-task = "MEDS_tabular_automl.scripts.cache_task:main"
+meds-tab-xgboost = "MEDS_tabular_automl.scripts.launch_xgboost:main"
+meds-tab-xgboost-sweep = "MEDS_tabular_automl.scripts.sweep_xgboost:main"
+
+[project.optional-dependencies]
+dev = ["pre-commit"]
+tests = ["pytest", "pytest-cov", "rootutils"]
+profiling = ["mprofile", "matplotlib"]
+
+[build-system]
+requires = ["setuptools>=61.0", "setuptools-scm>=8.0", "wheel"]
+build-backend = "setuptools.build_meta"
 
 [project.urls]
-Homepage = "https://github.com/mmcdermott/MEDS_polars_functions"
-Issues = "https://github.com/mmcdermott/MEDS_polars_functions/issues"
+Homepage = "https://github.com/mmcdermott/MEDS_Tabular_AutoML"
+Issues = "https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues"
Empty file.
17 changes: 17 additions & 0 deletions src/MEDS_tabular_automl/configs/default.yaml
@@ -0,0 +1,17 @@
MEDS_cohort_dir: ???
do_overwrite: False
seed: 1
tqdm: False
worker: 0
loguru_init: False

log_dir: ${output_dir}/.logs/

hydra:
  verbose: False
  job:
    name: MEDS_TAB_${name}_${worker}_{now:%Y-%m-%d_%H-%M-%S}
  sweep:
    dir: ${log_dir}
  run:
    dir: ${log_dir}
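These shared defaults are inherited (via Hydra's `defaults` list) by the script configs that follow. A hedged sketch: `???` is OmegaConf's mandatory-value marker, so `MEDS_cohort_dir` must be supplied on the command line, and the other fields can be overridden the same way:

```bash
# Hypothetical: override shared defaults on any entry point's command line.
# Omitting MEDS_cohort_dir raises a missing-mandatory-value error at startup.
meds-tab-describe MEDS_cohort_dir=/data/meds do_overwrite=True seed=42 tqdm=True
```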
14 changes: 14 additions & 0 deletions src/MEDS_tabular_automl/configs/describe_codes.yaml
@@ -0,0 +1,14 @@
defaults:
  - default
  - _self_

# split we wish to get metadata for
split: train
# Raw data, must have a subdirectory "train" with the training data split
input_dir: ${MEDS_cohort_dir}/final_cohort/${split}
# Where to store output code frequency data
cache_dir: ${MEDS_cohort_dir}/.cache
output_dir: ${MEDS_cohort_dir}
output_filepath: ${output_dir}/code_metadata.parquet

name: describe_codes
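To make the interpolations concrete, a sketch of how the paths resolve for a hypothetical cohort directory (the command is the entry point defined in `pyproject.toml`; the path is illustrative):

```bash
# With MEDS_cohort_dir=/data/meds and the default split=train, this config resolves to:
#   input_dir       -> /data/meds/final_cohort/train
#   cache_dir       -> /data/meds/.cache
#   output_filepath -> /data/meds/code_metadata.parquet
meds-tab-describe MEDS_cohort_dir=/data/meds
```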
81 changes: 81 additions & 0 deletions src/MEDS_tabular_automl/configs/launch_xgboost.yaml
@@ -0,0 +1,81 @@
defaults:
  - default
  - tabularization: default
  - _self_

task_name: task
# min code frequency used for modeling, can potentially sweep over different values.
modeling_min_code_freq: 10

# Task cached data dir
input_dir: ${MEDS_cohort_dir}/${task_name}/task_cache
# Directory with task labels
input_label_dir: ${MEDS_cohort_dir}/${task_name}/labels
# Where to output the model and cached data
output_dir: ${MEDS_cohort_dir}/model/model_${now:%Y-%m-%d_%H-%M-%S}
output_filepath: ${output_dir}/model_metadata.parquet
cache_dir: ${MEDS_cohort_dir}/.cache

# Model parameters
model_params:
  num_boost_round: 1000
  early_stopping_rounds: 5
  model:
    booster: gbtree
    device: cpu
    nthread: 1
    tree_method: hist
    objective: binary:logistic
  iterator:
    keep_data_in_memory: True
    binarize_task: True

# Define search space for Optuna
optuna:
  study_name: xgboost_sweep_${now:%Y-%m-%d_%H-%M-%S}
  storage: null
  load_if_exists: False
  direction: minimize
  sampler: null
  pruner: null

  n_trials: 10
  n_jobs: 1
  show_progress_bar: False

  params:
    suggest_categorical:
      window_sizes: ${generate_permutations:${tabularization.window_sizes}}
      aggs: ${generate_permutations:${tabularization.aggs}}
    suggest_float:
      eta:
        low: .001
        high: 1
        log: True
      lambda:
        low: .001
        high: 1
        log: True
      alpha:
        low: .001
        high: 1
        log: True
      subsample:
        low: 0.5
        high: 1
      min_child_weight:
        low: 1e-2
        high: 100
    suggest_int:
      num_boost_round:
        low: 10
        high: 1000
      max_depth:
        low: 2
        high: 16
      min_code_inclusion_frequency:
        low: 10
        high: 1_000_000
        log: True

name: launch_xgboost
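A hedged sketch of launching a sweep against this config: nested keys use Hydra's dotted override syntax, and `meds-tab-xgboost-sweep` is the entry point added in `pyproject.toml` above (cohort path, trial count, and bounds are illustrative):

```bash
# Hypothetical sweep run: more trials, GPU training, and a narrower max_depth range.
meds-tab-xgboost-sweep MEDS_cohort_dir=/data/meds task_name=task \
  optuna.n_trials=50 \
  model_params.model.device=cuda \
  optuna.params.suggest_int.max_depth.high=8
```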
12 changes: 12 additions & 0 deletions src/MEDS_tabular_automl/configs/tabularization.yaml
@@ -0,0 +1,12 @@
defaults:
  - default
  - tabularization: default
  - _self_

# Raw data
# Where the code metadata is stored
input_code_metadata_fp: ${MEDS_cohort_dir}/code_metadata.parquet
input_dir: ${MEDS_cohort_dir}/final_cohort
output_dir: ${MEDS_cohort_dir}/tabularize

name: tabularization
Empty file.
22 changes: 22 additions & 0 deletions src/MEDS_tabular_automl/configs/tabularization/default.yaml
@@ -0,0 +1,22 @@
# User inputs
allowed_codes: null
min_code_inclusion_frequency: 10
filtered_code_metadata_fp: ${MEDS_cohort_dir}/tabularized_code_metadata.parquet
window_sizes:
  - "1d"
  - "7d"
  - "30d"
  - "365d"
  - "full"
aggs:
  - "static/present"
  - "static/first"
  - "code/count"
  - "value/count"
  - "value/sum"
  - "value/sum_sqd"
  - "value/min"
  - "value/max"

# Resolved inputs
_resolved_codes: ${filter_to_codes:${tabularization.allowed_codes},${tabularization.min_code_inclusion_frequency},${tabularization.filtered_code_metadata_fp}}
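These user inputs can be narrowed per run; a hedged sketch using standard Hydra list-override syntax (command name and path are illustrative):

```bash
# Hypothetical: tabularize with fewer windows and aggregations, and a higher
# code-frequency cutoff than the defaults above.
meds-tab-tabularize-static MEDS_cohort_dir=/data/meds \
  tabularization.min_code_inclusion_frequency=100 \
  'tabularization.window_sizes=[1d,30d,full]' \
  'tabularization.aggs=[code/count,value/sum]'
```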
14 changes: 14 additions & 0 deletions src/MEDS_tabular_automl/configs/task_specific_caching.yaml
@@ -0,0 +1,14 @@
defaults:
  - default
  - tabularization: default
  - _self_

task_name: task

# Tabularized Data
input_dir: ${MEDS_cohort_dir}/tabularize
# Where the labels are stored, with columns patient_id, timestamp, label
input_label_dir: ${MEDS_cohort_dir}/${task_name}/labels
# Where to output the task specific tabularized data
output_dir: ${MEDS_cohort_dir}/${task_name}/task_cache

name: task_specific_caching
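A hedged invocation sketch: step 5 in the README above writes labels into `input_label_dir`, after which this caching step aligns them with the tabularized data (command and task name are illustrative):

```bash
# Hypothetical: cache task-aligned tabular data for a named prediction task.
meds-tab-cache-task MEDS_cohort_dir=/data/meds task_name=readmission
```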