
Merge pull request #81 from mmcdermott/dev
added autogluon support, more models, more preprocessing strategies
Oufattole authored Sep 10, 2024
2 parents 0ec527a + f7e03dd commit 07001be
Showing 68 changed files with 2,677 additions and 1,126 deletions.
6 changes: 4 additions & 2 deletions .github/workflows/code-quality-main.yaml
@@ -13,10 +13,12 @@ jobs:

    steps:
      - name: Checkout
-       uses: actions/checkout@v3
+       uses: actions/checkout@v4

      - name: Set up Python
-       uses: actions/setup-python@v3
+       uses: actions/setup-python@v5
+       with:
+         python-version: "3.11"

      - name: Run pre-commits
        uses: pre-commit/[email protected]
6 changes: 4 additions & 2 deletions .github/workflows/code-quality-pr.yaml
@@ -16,10 +16,12 @@ jobs:

    steps:
      - name: Checkout
-       uses: actions/checkout@v3
+       uses: actions/checkout@v4

      - name: Set up Python
-       uses: actions/setup-python@v3
+       uses: actions/setup-python@v5
+       with:
+         python-version: "3.11"

      - name: Find modified files
        id: file_changes
28 changes: 2 additions & 26 deletions .github/workflows/publish-to-pypi.yml
@@ -12,7 +12,7 @@ jobs:
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
-         python-version: "3.x"
+         python-version: "3.11"
      - name: Install pypa/build
        run: >-
          python3 -m
@@ -36,7 +36,7 @@ jobs:
    runs-on: ubuntu-latest
    environment:
      name: pypi
-     url: https://pypi.org/p/<package-name> # Replace <package-name> with your PyPI project name
+     url: https://pypi.org/p/meds-tab # Replace <package-name> with your PyPI project name
    permissions:
      id-token: write # IMPORTANT: mandatory for trusted publishing

@@ -91,27 +91,3 @@ jobs:
        gh release upload
        '${{ github.ref_name }}' dist/**
        --repo '${{ github.repository }}'
- publish-to-testpypi:
-   name: Publish Python 🐍 distribution 📦 to TestPyPI
-   needs:
-     - build
-   runs-on: ubuntu-latest
-
-   environment:
-     name: testpypi
-     url: https://test.pypi.org/p/<package-name>
-
-   permissions:
-     id-token: write # IMPORTANT: mandatory for trusted publishing
-
-   steps:
-     - name: Download all the dists
-       uses: actions/download-artifact@v3
-       with:
-         name: python-package-distributions
-         path: dist/
-     - name: Publish distribution 📦 to TestPyPI
-       uses: pypa/gh-action-pypi-publish@release/v1
-       with:
-         repository-url: https://test.pypi.org/legacy/
10 changes: 6 additions & 4 deletions .github/workflows/tests.yaml
@@ -12,17 +12,19 @@ jobs:

    strategy:
      fail-fast: false
+     matrix:
+       python-version: ["3.11", "3.12"]

    timeout-minutes: 30

    steps:
      - name: Checkout
-       uses: actions/checkout@v3
+       uses: actions/checkout@v4

-     - name: Set up Python 3.12
-       uses: actions/setup-python@v3
+     - name: Set up Python ${{ matrix.python-version }}
+       uses: actions/setup-python@v5
        with:
-         python-version: "3.12"
+         python-version: ${{ matrix.python-version }}

      - name: Install packages
        run: |
4 changes: 1 addition & 3 deletions .pre-commit-config.yaml
@@ -1,7 +1,5 @@
default_language_version:
-  python: python3.12
-
-exclude: "sample_data|docs/MIMIC_IV_tutorial/wandb_reports"
+  python: python3.11

repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
55 changes: 55 additions & 0 deletions MIMICIV_TUTORIAL/README.MD
@@ -0,0 +1,55 @@
# MIMIC-IV Example

This is an example of how to extract a MEDS dataset from MIMIC-IV. All scripts in this README are assumed to
be run **not** from this directory but from the root directory of this entire repository (e.g., one directory
up from this one).

## Extract MIMIC-IV MEDS Data

### Download pre-extracted data from GCP

Install the [gcloud client](https://cloud.google.com/sdk/docs/install), then run the following commands to download the MEDS data from the GCP bucket:

```console
export MIMICIV_MEDS_DIR=??? # set to the directory in which you want to store the MEDS-formatted MIMIC-IV data
export OUTPUT_TABULARIZATION_DIR=??? # set to the output directory for the tabularized data
export OUTPUT_MODEL_DIR=${OUTPUT_TABULARIZATION_DIR}/results/ # set to the base results directory

cd $MIMICIV_MEDS_DIR
gcloud storage cp gs://ehr_standardization_schema/MEDS_Extract_v0.0.7_test.zip meds_extract_0.0.7_data.zip
unzip meds_extract_0.0.7_data.zip
rm meds_extract_0.0.7_data.zip
```

Then create a fresh environment and install `meds-tab`:

```console
conda create -n meds_tab python=3.12
conda activate meds_tab
pip install "meds-tab==0.0.5"
```
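
To sanity-check the installation, you can print the help text for one of the installed entry points (any of the `meds-tab-*` commands works):

```console
meds-tab-describe --help
```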

Next, we need labels for our tasks. We will use the `long_los` and `icu_mortality` tasks as examples.

### Download pre-extracted labels from GCP

```console
TASKS=("long_los" "icu_mortality")
TASKS_DIR="$MIMICIV_MEDS_DIR/tasks/" # set to the directory in which you want to store all tasks

mkdir -p "${TASKS_DIR}" # create a directory for the tasks

for TASK_NAME in "${TASKS[@]}"
do
gcloud storage cp "gs://ehr_standardization_schema/benchmark_v1/data/labels/${TASK_NAME}.parquet" "${TASKS_DIR}/${TASK_NAME}/0.parquet"
done
```
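
At this point each task directory should hold a single label shard; a quick check (expected output shown as a comment) might look like:

```console
ls "${TASKS_DIR}"
# icu_mortality  long_los
```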

## Run Tabularization and XGBoost Baseline

```console
export N_PARALLEL_WORKERS=48 # Set number of workers
export RESHARD_DIR=??? # set to the directory to output the resharded MEDS data
bash MIMICIV_TUTORIAL/tabularize_meds.sh "${MIMICIV_MEDS_DIR}" "$RESHARD_DIR" $OUTPUT_TABULARIZATION_DIR \
"long_los,icu_mortality" $TASKS_DIR $OUTPUT_MODEL_DIR $N_PARALLEL_WORKERS \
"tabularization.aggs=[static/present,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]" \
"tabularization.window_sizes=[2h,12h,1d,7d,30d,365d,full]"
```
106 changes: 106 additions & 0 deletions MIMICIV_TUTORIAL/tabularize_meds.sh
@@ -0,0 +1,106 @@
#!/usr/bin/env bash

set -e

# Function to print help message
print_help() {
    echo "Usage: $0 <MIMICIV_MEDS_DIR> <MIMICIV_MEDS_RESHARD_DIR> <OUTPUT_TABULARIZATION_DIR> <TASKS> <TASKS_DIR> <OUTPUT_MODEL_DIR> <N_PARALLEL_WORKERS> [additional arguments]"
    echo
    echo "Arguments:"
    echo "  MIMICIV_MEDS_DIR          Directory containing the MEDS-formatted MIMIC-IV data"
    echo "  MIMICIV_MEDS_RESHARD_DIR  Directory for the resharded MEDS-formatted MIMIC-IV data"
    echo "  OUTPUT_TABULARIZATION_DIR Output directory for tabularized data"
    echo "  TASKS                     Comma-separated list of tasks to run (e.g., 'long_los,icu_mortality')"
    echo "  TASKS_DIR                 Directory containing task-specific data"
    echo "  OUTPUT_MODEL_DIR          Output directory for models"
    echo "  N_PARALLEL_WORKERS        Number of parallel workers to use"
    echo
    echo "Additional arguments will be passed to the underlying commands."
}

# Check for help flag
if [[ "$1" == "--help" || "$1" == "-h" ]]; then
print_help
exit 0
fi

# Check if we have the minimum required number of arguments
if [ "$#" -lt 7 ]; then
echo "Error: Not enough arguments provided."
print_help
exit 1
fi

# Assign arguments to variables
MIMICIV_MEDS_DIR="$1"
MIMICIV_MEDS_RESHARD_DIR="$2"
OUTPUT_TABULARIZATION_DIR="$3"
TASKS="$4"
TASKS_DIR="$5"
OUTPUT_MODEL_DIR="$6"
N_PARALLEL_WORKERS="$7"

shift 7

# Split the TASKS string into an array
IFS=',' read -ra TASK_ARRAY <<< "$TASKS"

# Print input arguments
echo "Input arguments:"
echo "MIMICIV_MEDS_DIR: $MIMICIV_MEDS_DIR"
echo "MIMICIV_MEDS_RESHARD_DIR: $MIMICIV_MEDS_RESHARD_DIR"
echo "OUTPUT_TABULARIZATION_DIR: $OUTPUT_TABULARIZATION_DIR"
echo "TASKS:" "${TASK_ARRAY[@]}"
echo "TASKS_DIR: $TASKS_DIR"
echo "OUTPUT_MODEL_DIR: $OUTPUT_MODEL_DIR"
echo "N_PARALLEL_WORKERS: $N_PARALLEL_WORKERS"
echo "Additional arguments:" "$@"
echo

# Reshard the data
echo "Resharding data"
MEDS_transform-reshard_to_split \
    --multirun \
    worker="range(0,6)" \
    hydra/launcher=joblib \
    input_dir="$MIMICIV_MEDS_DIR" \
    cohort_dir="$MIMICIV_MEDS_RESHARD_DIR" \
    'stages=["reshard_to_split"]' \
    stage="reshard_to_split" \
    stage_configs.reshard_to_split.n_subjects_per_shard=2500 \
    "polling_time=5"

# Describe codes
echo "Describing codes"
meds-tab-describe \
    "input_dir=${MIMICIV_MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR"

# Tabularize static data
echo "Tabularizing static data"
meds-tab-tabularize-static \
    "input_dir=${MIMICIV_MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" \
    do_overwrite=False "$@"

# Tabularize time-series data in parallel across workers
meds-tab-tabularize-time-series \
    --multirun \
    worker="range(0,$N_PARALLEL_WORKERS)" \
    hydra/launcher=joblib \
    "input_dir=${MIMICIV_MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" \
    do_overwrite=False "$@"

for TASK in "${TASK_ARRAY[@]}"
do
    echo "Running meds-tab-cache-task for task: $TASK"
    meds-tab-cache-task \
        hydra/launcher=joblib \
        "input_dir=${MIMICIV_MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" \
        "input_label_dir=${TASKS_DIR}/${TASK}/" "task_name=${TASK}" do_overwrite=False "$@"

    echo "Running xgboost for task: $TASK"
    meds-tab-xgboost \
        --multirun \
        worker="range(0,$N_PARALLEL_WORKERS)" \
        "input_dir=${MIMICIV_MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" \
        "output_model_dir=${OUTPUT_MODEL_DIR}/${TASK}/" "task_name=$TASK" do_overwrite=False \
        "hydra.sweeper.n_trials=1000" "hydra.sweeper.n_jobs=${N_PARALLEL_WORKERS}" \
        "$@"
done
44 changes: 22 additions & 22 deletions README.md
@@ -84,12 +84,12 @@ By following these steps, you can seamlessly transform your dataset, define nece

```console
# Re-shard pipeline
-# $MIMICIV_MEDS_DIR is the directory containing the input, MEDS v0.3 formatted MIMIC-IV data
+# $MIMICIV_input_dir is the directory containing the input, MEDS v0.3 formatted MIMIC-IV data
# $MEDS_TAB_COHORT_DIR is the directory where the re-sharded MEDS dataset will be stored, and where your model
# will store cached files during processing by default.
# $N_PATIENTS_PER_SHARD is the number of patients per shard you want to use.
MEDS_transform-reshard_to_split \
-    input_dir="$MIMICIV_MEDS_DIR" \
+    input_dir="$MIMICIV_input_dir" \
    cohort_dir="$MEDS_TAB_COHORT_DIR" \
    'stages=["reshard_to_split"]' \
    stage="reshard_to_split" \
@@ -103,14 +103,14 @@ By following these steps, you can seamlessly transform your dataset, define nece

   - static codes (codes without timestamps)
   - static numerical codes (codes without timestamps but with numerical values).

-  This script further caches feature names and frequencies in a dataset stored in a `code_metadata.parquet` file within the `MEDS_cohort_dir` argument specified as a hydra-style command line argument.
+  This script further caches feature names and frequencies in a dataset stored in a `code_metadata.parquet` file within the `input_dir` argument specified as a hydra-style command line argument.
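
   For instance, a minimal invocation of this script might look like the sketch below, where `path_to_data` and `output_directory` are placeholder paths (the argument names mirror the tutorial script in `MIMICIV_TUTORIAL/tabularize_meds.sh`):

   ```console
   meds-tab-describe input_dir="path_to_data" output_dir="output_directory"
   ```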

-2. **`meds-tab-tabularize-static`**: Filters and processes the dataset based on the frequency of codes, generating a tabular vector for each patient at each timestamp in the shards. Each row corresponds to a unique `patient_id` and `timestamp` combination, thus rows are duplicated across multiple timestamps for the same patient.
+2. **`meds-tab-tabularize-static`**: Filters and processes the dataset based on the frequency of codes, generating a tabular vector for each patient at each timestamp in the shards. Each row corresponds to a unique `subject_id` and `timestamp` combination, thus rows are duplicated across multiple timestamps for the same patient.

   **Example: Tabularizing static data** with the minimum code frequency of 10, window sizes of `[1d, 30d, 365d, full]`, and value aggregation methods of `[static/present, static/first, code/count, value/count, value/sum, value/sum_sqd, value/min, value/max]`

   ```console
-  meds-tab-tabularize-static MEDS_cohort_dir="path_to_data" \
+  meds-tab-tabularize-static input_dir="path_to_data" \
       tabularization.min_code_inclusion_frequency=10 \
       tabularization.window_sizes=[1d,30d,365d,full] \
       do_overwrite=False \
@@ -119,27 +119,27 @@ By following these steps, you can seamlessly transform your dataset, define nece

   - For the exhaustive examples of value aggregations, see [`/src/MEDS_tabular_automl/utils.py`](https://github.com/mmcdermott/MEDS_Tabular_AutoML/blob/main/src/MEDS_tabular_automl/utils.py#L24)

-3. **`meds-tab-tabularize-time-series`**: Iterates through combinations of a shard, `window_size`, and `aggregation` to generate feature vectors that aggregate patient data for each unique `patient_id` x `timestamp`. This stage (and the previous stage) uses sparse matrix formats to efficiently handle the computational and storage demands of rolling window calculations on large datasets. We support parallelization through Hydra's [`--multirun`](https://hydra.cc/docs/intro/#multirun) flag and the [`joblib` launcher](https://hydra.cc/docs/plugins/joblib_launcher/#internaldocs-banner).
+3. **`meds-tab-tabularize-time-series`**: Iterates through combinations of a shard, `window_size`, and `aggregation` to generate feature vectors that aggregate patient data for each unique `subject_id` x `timestamp`. This stage (and the previous stage) uses sparse matrix formats to efficiently handle the computational and storage demands of rolling window calculations on large datasets. We support parallelization through Hydra's [`--multirun`](https://hydra.cc/docs/intro/#multirun) flag and the [`joblib` launcher](https://hydra.cc/docs/plugins/joblib_launcher/#internaldocs-banner).

   **Example: Aggregate time-series data** on features across different `window_sizes`

   ```console
   meds-tab-tabularize-time-series --multirun \
       worker="range(0,$N_PARALLEL_WORKERS)" \
       hydra/launcher=joblib \
-      MEDS_cohort_dir="path_to_data" \
+      input_dir="path_to_data" \
       tabularization.min_code_inclusion_frequency=10 \
       do_overwrite=False \
       tabularization.window_sizes=[1d,30d,365d,full] \
       tabularization.aggs=[static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]
   ```

-4. **`meds-tab-cache-task`**: Aligns task-specific labels with the nearest prior event in the tabularized data. It requires a labeled dataset directory with three columns (`patient_id`, `timestamp`, `label`) structured similarly to the `MEDS_cohort_dir`.
+4. **`meds-tab-cache-task`**: Aligns task-specific labels with the nearest prior event in the tabularized data. It requires a labeled dataset directory with three columns (`subject_id`, `timestamp`, `label`) structured similarly to the `input_dir`.

-  **Example: Align tabularized data** for a specific task `$TASK` and labels that has pulled from [ACES](https://github.com/justin13601/ACES)
+  **Example: Align tabularized data** for a specific task `$TASK` and labels that have been pulled from [ACES](https://github.com/justin13601/ACES)

   ```console
-  meds-tab-cache-task MEDS_cohort_dir="path_to_data" \
+  meds-tab-cache-task input_dir="path_to_data" \
       task_name=$TASK \
       tabularization.min_code_inclusion_frequency=10 \
       do_overwrite=False \
@@ -151,7 +151,7 @@ By following these steps, you can seamlessly transform your dataset, define nece

   ```console
   meds-tab-xgboost --multirun \
-      MEDS_cohort_dir="path_to_data" \
+      input_dir="path_to_data" \
       task_name=$TASK \
       output_dir="output_directory" \
       tabularization.min_code_inclusion_frequency=10 \
@@ -321,7 +321,7 @@ Now that we have generated tabular features for all the events in our dataset, w

- **Row Selection Based on Tasks**: Only the data rows that are relevant to the specific tasks are selected and cached. This reduces the memory footprint and speeds up the training process.
- **Use of Sparse Matrices for Efficient Storage**: Sparse matrices are again employed here to store the selected data efficiently, ensuring that only non-zero data points are kept in memory, thus optimizing both storage and retrieval times.

-The file structure for the cached data mirrors that of the tabular data, also consisting of `.npz` files, where users must specify the directory that stores labels. Labels follow the same shard file structure as the input meds data from step (1), and the label parquets need `patient_id`, `timestamp`, and `label` columns.
+The file structure for the cached data mirrors that of the tabular data, also consisting of `.npz` files, where users must specify the directory that stores labels. Labels follow the same shard file structure as the input MEDS data from step (1), and the label parquets need `subject_id`, `timestamp`, and `label` columns.
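
For instance, the MIMIC-IV tutorial above lays its labels out like this (a sketch; `long_los` is one example task, and the shard layout must mirror the data shards):

```console
tasks/
└── long_los/
    └── 0.parquet   # columns: subject_id, timestamp, label
```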

## 4. XGBoost Training

@@ -436,7 +436,7 @@ A single XGBoost run was completed to profile time and memory usage. This was do

```console
meds-tab-xgboost \
-    MEDS_cohort_dir="path_to_data" \
+    input_dir="path_to_data" \
    task_name=$TASK \
    output_dir="output_directory" \
    do_overwrite=False \
@@ -506,7 +506,7 @@ The XGBoost sweep was run using the following command for each `$TASK`:

```console
meds-tab-xgboost --multirun \
-    MEDS_cohort_dir="path_to_data" \
+    input_dir="path_to_data" \
    task_name=$TASK \
    output_dir="output_directory" \
    tabularization.window_sizes=$(generate-subsets [1d,30d,365d,full]) \
@@ -529,14 +529,14 @@ The hydra sweeper swept over the parameters:

```yaml
params:
-  +model_params.model.eta: tag(log, interval(0.001, 1))
-  +model_params.model.lambda: tag(log, interval(0.001, 1))
-  +model_params.model.alpha: tag(log, interval(0.001, 1))
-  +model_params.model.subsample: interval(0.5, 1)
-  +model_params.model.min_child_weight: interval(1e-2, 100)
-  +model_params.model.max_depth: range(2, 16)
-  model_params.num_boost_round: range(100, 1000)
-  model_params.early_stopping_rounds: range(1, 10)
+  model.eta: tag(log, interval(0.001, 1))
+  model.lambda: tag(log, interval(0.001, 1))
+  model.alpha: tag(log, interval(0.001, 1))
+  model.subsample: interval(0.5, 1)
+  model.min_child_weight: interval(1e-2, 100)
+  model.max_depth: range(2, 16)
+  num_boost_round: range(100, 1000)
+  early_stopping_rounds: range(1, 10)
  tabularization.min_code_inclusion_frequency: tag(log, range(10, 1000000))
```
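
Individual sweep settings can also be overridden directly on the command line. For example, a sketch reusing the placeholder paths from above and the trial settings used in the MIMIC-IV tutorial script:

```console
meds-tab-xgboost --multirun \
    input_dir="path_to_data" \
    output_dir="output_directory" \
    task_name=$TASK \
    "hydra.sweeper.n_trials=1000" \
    "hydra.sweeper.n_jobs=$N_PARALLEL_WORKERS"
```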
