mmcdermott · Oufattole · Sep 10, 2024 · Sep 9, 2024 · Sep 9, 2024 · Sep 9, 2024
diff --git a/MIMICIV_TUTORIAL/README.MD b/MIMICIV_TUTORIAL/README.MD
@@ -0,0 +1,55 @@
+# MIMIC-IV Example
+
+This is an example of how to extract a MEDS dataset from MIMIC-IV. All commands in this README are assumed to
+be run from the root directory of this repository (i.e., one directory up from this one), **not** from this
+directory.
+
+## Extract MIMIC-IV MEDS Data
+
+### Download pre-extracted data from gcp
+
+Install the [gcloud client](https://cloud.google.com/sdk/docs/install) and then run the following command to download the MEDS data from the gcp bucket:
+
+```console
+export MIMICIV_MEDS_DIR=??? # set to the directory in which you want to store the pre-extracted MIMIC-IV MEDS data
+export OUTPUT_TABULARIZATION_DIR=??? # set to the output directory for the tabularized data
+export OUTPUT_MODEL_DIR=${OUTPUT_TABULARIZATION_DIR}/results/ # set to the base results directory
+
+cd "$MIMICIV_MEDS_DIR"
+gcloud storage cp gs://ehr_standardization_schema/MEDS_Extract_v0.0.7_test.zip meds_extract_0.0.7_data.zip
+unzip meds_extract_0.0.7_data.zip
+rm meds_extract_0.0.7_data.zip && cd - # return to the previous directory (the repository root) for the remaining steps
+```
+
+```console
+conda create -n meds_tab python=3.12
+conda activate meds_tab
+pip install "meds-tab==0.0.5"
+```
+
+Next we need to get some labels for our tasks. We will use the `long_los` and `icu_mortality` tasks as examples.
+
+### Download pre-extracted labels from gcp:
+
+```console
+TASKS=("long_los" "icu_mortality")
+TASKS_DIR="$MIMICIV_MEDS_DIR/tasks/" # set to the directory in which you want to store all tasks
+
+mkdir -p "${TASKS_DIR}" # create a directory for the task
+
+for TASK_NAME in "${TASKS[@]}"
+do
+    gcloud storage cp "gs://ehr_standardization_schema/benchmark_v1/data/labels/${TASK_NAME}.parquet" "${TASKS_DIR}/${TASK_NAME}/0.parquet"
+done
+```
+
+## Run Tabularization and XGBoost Baseline
+
+```console
+export N_PARALLEL_WORKERS=48 # Set number of workers
+export RESHARD_DIR=??? # set to the directory in which to output the resharded MEDS data
+bash MIMICIV_TUTORIAL/tabularize_meds.sh "${MIMICIV_MEDS_DIR}" "$RESHARD_DIR" "$OUTPUT_TABULARIZATION_DIR" \
+    "long_los,icu_mortality" "$TASKS_DIR" "$OUTPUT_MODEL_DIR" "$N_PARALLEL_WORKERS" \
+    "tabularization.aggs=[static/present,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]" \
+    "tabularization.window_sizes=[2h,12h,1d,7d,30d,365d,full]"
+```
diff --git a/MIMICIV_TUTORIAL/tabularize_meds.sh b/MIMICIV_TUTORIAL/tabularize_meds.sh
@@ -0,0 +1,106 @@
+#!/usr/bin/env bash
+
+set -e  # abort the whole pipeline as soon as any command exits non-zero
+
+# Function to print help message
+print_help() {
+    echo "Usage: $0 <MIMICIV_MEDS_DIR> <MIMICIV_MEDS_RESHARD_DIR> <OUTPUT_TABULARIZATION_DIR> <TASKS> <TASKS_DIR> <OUTPUT_MODEL_DIR> <N_PARALLEL_WORKERS> [additional arguments]"
+    echo
+    echo "Arguments:"
+    echo "  MIMICIV_MEDS_DIR            Directory containing MIMIC-IV medications data"
+    echo "  MIMICIV_MEDS_RESHARD_DIR    Directory for resharded MIMIC-IV medications data"
+    echo "  OUTPUT_TABULARIZATION_DIR   Output directory for tabularized data"
+    echo "  TASKS                       Comma-separated list of tasks to run (e.g., 'long_los,icu_mortality')"
+    echo "  TASKS_DIR                   Directory containing task-specific data"
+    echo "  OUTPUT_MODEL_DIR            Output directory for models"
+    echo "  N_PARALLEL_WORKERS          Number of parallel workers to use"
+    echo
+    echo "Additional arguments will be passed to the underlying commands."
+}
+
+# Help requested as the first argument: print usage to stdout and exit successfully
+if [[ "${1:-}" == "--help" || "${1:-}" == "-h" ]]; then
+    print_help
+    exit 0
+fi
+
+# Seven positional arguments are mandatory; otherwise report a usage error on stderr and fail
+if [[ "$#" -lt 7 ]]; then
+    echo "Error: Not enough arguments provided." >&2
+    print_help >&2
+    exit 1
+fi
+
+# Assign arguments to variables
+MIMICIV_MEDS_DIR="$1"
+MIMICIV_MEDS_RESHARD_DIR="$2"
+OUTPUT_TABULARIZATION_DIR="$3"
+TASKS="$4"
+TASKS_DIR="$5"
+OUTPUT_MODEL_DIR="$6"
+N_PARALLEL_WORKERS="$7"
+
+shift 7  # drop the seven positionals so that "$@" holds only the additional arguments
+
+# Split the TASKS string into an array
+IFS=',' read -ra TASK_ARRAY <<< "$TASKS"  # comma is the delimiter; the IFS override applies to this read only
+
+# Print input arguments
+echo "Input arguments:"
+echo "MIMICIV_MEDS_DIR: $MIMICIV_MEDS_DIR"
+echo "MIMICIV_MEDS_RESHARD_DIR: $MIMICIV_MEDS_RESHARD_DIR"
+echo "OUTPUT_TABULARIZATION_DIR: $OUTPUT_TABULARIZATION_DIR"
+echo "TASKS:" "${TASK_ARRAY[@]}"  # task names printed space-separated
+echo "TASKS_DIR: $TASKS_DIR"
+echo "OUTPUT_MODEL_DIR: $OUTPUT_MODEL_DIR"
+echo "N_PARALLEL_WORKERS: $N_PARALLEL_WORKERS"
+echo "Additional arguments:" "$@"  # these are forwarded to several of the commands run below
+echo
+
+# Reshard the data
+echo "Resharding data"
+MEDS_transform-reshard_to_split \
+  --multirun \
+  worker="range(0,6)" \
+  hydra/launcher=joblib \
+  input_dir="$MIMICIV_MEDS_DIR" \
+  cohort_dir="$MIMICIV_MEDS_RESHARD_DIR" \
+  'stages=["reshard_to_split"]' \
+  stage="reshard_to_split" \
+  stage_configs.reshard_to_split.n_subjects_per_shard=2500 \
+  "polling_time=5"
+
+# describe codes
+echo "Describing codes"
+meds-tab-describe \
+    "input_dir=${MIMICIV_MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR"
+
+echo "Tabularizing static data"
+meds-tab-tabularize-static \
+    "input_dir=${MIMICIV_MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" \
+    do_overwrite=False "$@"
+
+meds-tab-tabularize-time-series \
+    --multirun \
+    worker="range(0,$N_PARALLEL_WORKERS)" \
+    hydra/launcher=joblib \
+    "input_dir=${MIMICIV_MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" \
+    do_overwrite=False "$@"
+
+for TASK in "${TASK_ARRAY[@]}"
+do
+    echo "Running task_specific_caching.py for task: $TASK"
+    meds-tab-cache-task \
+    hydra/launcher=joblib \
+    "input_dir=${MIMICIV_MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" \
+    "input_label_dir=${TASKS_DIR}/${TASK}/" "task_name=${TASK}" do_overwrite=False "$@"
+
+  echo "Running xgboost for task: $TASK"
+  meds-tab-xgboost \
+      --multirun \
+      worker="range(0,$N_PARALLEL_WORKERS)" \
+      "input_dir=${MIMICIV_MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" \
+      "output_model_dir=${OUTPUT_MODEL_DIR}/${TASK}/" "task_name=$TASK" do_overwrite=False \
+      "hydra.sweeper.n_trials=1000" "hydra.sweeper.n_jobs=${N_PARALLEL_WORKERS}" \
+      "$@"
+done
diff --git a/src/MEDS_tabular_automl/configs/default.yaml b/src/MEDS_tabular_automl/configs/default.yaml
@@ -12,7 +12,7 @@ cache_dir: ${output_dir}/.cache
 hydra:
   verbose: False
   job:
-    name: MEDS_TAB_${name}_${worker}_{now:%Y-%m-%d_%H-%M-%S}
+    name: MEDS_TAB_${name}_${worker}_${now:%Y-%m-%d_%H-%M-%S}
   sweep:
     dir: ${log_dir}
   run:

diff --git a/src/MEDS_tabular_automl/configs/launch_model.yaml b/src/MEDS_tabular_automl/configs/launch_model.yaml
@@ -22,7 +22,7 @@ name: launch_model
 
 hydra:
   sweep:
-    dir: ${output_model_dir}/sweeps/{now:%Y-%m-%d-%H-%M-%S}/
+    dir: ${output_model_dir}/sweeps/${now:%Y-%m-%d-%H-%M-%S}/
     subdir: "1"
   run:
     dir: ${path.model_log_dir}