From b289033b7ffc9173b851021477c317ae02dcd80a Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Mon, 9 Sep 2024 07:29:21 +0000 Subject: [PATCH 1/5] added mimic iv tutorial --- MIMICIV_TUTORIAL/README.MD | 69 +++++++++++++++++++++++++++++ MIMICIV_TUTORIAL/tabularize_meds.sh | 63 ++++++++++++++++++++++++++ 2 files changed, 132 insertions(+) create mode 100644 MIMICIV_TUTORIAL/README.MD create mode 100644 MIMICIV_TUTORIAL/tabularize_meds.sh diff --git a/MIMICIV_TUTORIAL/README.MD b/MIMICIV_TUTORIAL/README.MD new file mode 100644 index 0000000..e8ea9ac --- /dev/null +++ b/MIMICIV_TUTORIAL/README.MD @@ -0,0 +1,69 @@ +# MIMIC-IV Example + +This is an example of how to extract a MEDS dataset from MIMIC-IV. All scripts in this README are assumed to +be run **not** from this directory but from the root directory of this entire repository (e.g., one directory +up from this one). + +## Extract MIMIC-IV MEDS Data + +### Download pre-extracted data from gcp + +Install the [gcloud client](https://cloud.google.com/sdk/docs/install) and then run the following command to download the MEDS data from the gcp bucket: + +```console +export MIMICIV_MEDS_DIR=??? # set to the directory in which you want to store the raw MIMIC-IV data +export OUTPUT_TABULARIZATION_DIR=??? # set to the output directory for the tabularized data +export OUTPUT_MODEL_DIR=${OUTPUT_TABULARIZATION_DIR}/results/ # set to the base results directory + +cd $MIMICIV_MEDS_DIR +gcloud storage cp gs://ehr_standardization_schema/MEDS_Extract_v0.0.7_test.zip meds_extract_0.0.7_data.zip +unzip meds_extract_0.0.7_data.zip +rm meds_extract_0.0.7_data.zip +``` + +```console +conda create -n meds_tab python=3.12 +conda activate meds_tab +pip install "meds-tab==0.0.5" +``` + +Next we need to get some labels for our tasks. We will use the `long_los` and `icu_mortality` tasks as examples. 
+ +### Download pre-extracted labels from gcp: + +```console +TASKS=("long_los" "icu_mortality") +TASKS_DIR="$MIMICIV_MEDS_DIR/tasks/" # set to the directory in which you want to store all tasks + +mkdir -p "${TASKS_DIR}" # create a directory for the task + +for TASK_NAME in "${TASKS[@]}" +do + gcloud storage cp "gs://ehr_standardization_schema/benchmark_v1/data/labels/${TASK_NAME}.parquet" "${TASKS_DIR}/${TASK_NAME}.parquet" +done +``` + +## Pre-Processing for Tabularization + +```console +export N_PARALLEL_WORKERS=48 # Set number of workers +export RESHARD_DIR=??? # set to directory to output reshareded meds data +bash MIMICIV_TUTORIAL/tabularize_meds.sh "${MIMICIV_MEDS_DIR}" "$RESHARD_DIR" $OUTPUT_TABULARIZATION_DIR \ + ["long_los","icu_mortality"] $TASKS_DIR $OUTPUT_MODEL_DIR $N_PARALLEL_WORKERS \ + "tabularization.aggs=[static/present,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]" \ + "tabularization.window_sizes=[2h,12h,1d,7d,30d,365d,full]" +``` + +## Train XGBOOST Baseline + +```console +meds-tab-model "input_dir=${MIMICIV_MEDS_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" output_model_dir=$OUTPUT_MODEL_DIR task_name=long_los + "tabularization.aggs=[static/present,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]" \ + "tabularization.window_sizes=[2h,12h,1d,7d,30d,365d,full]" +``` + +```console +meds-tab-model "input_dir=${MIMICIV_MEDS_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" task_name=icu_mortality + "tabularization.aggs=[static/present,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]" \ + "tabularization.window_sizes=[2h,12h,1d,7d,30d,365d,full]" +``` diff --git a/MIMICIV_TUTORIAL/tabularize_meds.sh b/MIMICIV_TUTORIAL/tabularize_meds.sh new file mode 100644 index 0000000..221dfb1 --- /dev/null +++ b/MIMICIV_TUTORIAL/tabularize_meds.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash + +set -e + +MIMICIV_MEDS_DIR="$1" +MIMICIV_MEDS_RESHARD_DIR="$2" +OUTPUT_TABULARIZATION_DIR="$3" +TASKS="$4" 
+TASKS_DIR="$5" +OUTPUT_MODEL_DIR="$6" +N_PARALLEL_WORKERS="$7" + +shift 7 + + +IFS=',' read -r -a TASK_ARRAY <<< "$TASKS" + +MEDS_transform-reshard_to_split \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=joblib \ + input_dir="$MIMICIV_MEDS_DIR" \ + cohort_dir="$MIMICIV_MEDS_RESHARD_DIR" \ + 'stages=["reshard_to_split"]' \ + stage="reshard_to_split" \ + stage_configs.reshard_to_split.n_subjects_per_shard=2500 \ + "hydra.sweeper.polling_time=5" + +# describe codes +echo "Describing codes" +meds-tab-describe \ + "input_dir=$MIMICIV_MEDS_RESHARD_DIR" "output_dir=$OUTPUT_TABULARIZATION_DIR" + +echo "Tabularizing static data" +echo meds-tab-tabularize-static \ + "input_dir=$MIMICIV_MEDS_RESHARD_DIR" "output_dir=$OUTPUT_TABULARIZATION_DIR" \ + do_overwrite=False "$@" + +meds-tab-tabularize-time-series \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=joblib \ + "input_dir=$MIMICIV_MEDS_RESHARD_DIR" "output_dir=$OUTPUT_TABULARIZATION_DIR" \ + do_overwrite=False "$@" + + +for TASK in "${TASK_ARRAY[@]}" +do + echo "Running task_specific_caching.py" + meds-tab-cache-task \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=joblib \ + "input_dir=$MIMICIV_MEDS_DIR" "output_dir=$OUTPUT_TABULARIZATION_DIR" \ + "input_label_dir=${TASKS_DIR}" "task_name=${TASK}" do_overwrite=False "$@" + + echo "Running xgboost" + meds-tab-xgboost \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + "input_dir=$MIMICIV_MEDS_DIR" "output_dir=$OUTPUT_TABULARIZATION_DIR" \ + "output_model_dir=${OUTPUT_MODEL_DIR}/${TASK}/" "task_name=$TASK" do_overwrite=False "$@" +done From 9294920cc1911617abb26f7f91392430ef55f3fd Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Mon, 9 Sep 2024 07:38:54 +0000 Subject: [PATCH 2/5] updated tabularization script to fix bugs --- MIMICIV_TUTORIAL/tabularize_meds.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/MIMICIV_TUTORIAL/tabularize_meds.sh 
b/MIMICIV_TUTORIAL/tabularize_meds.sh index 221dfb1..5ba5807 100644 --- a/MIMICIV_TUTORIAL/tabularize_meds.sh +++ b/MIMICIV_TUTORIAL/tabularize_meds.sh @@ -24,23 +24,23 @@ MEDS_transform-reshard_to_split \ 'stages=["reshard_to_split"]' \ stage="reshard_to_split" \ stage_configs.reshard_to_split.n_subjects_per_shard=2500 \ - "hydra.sweeper.polling_time=5" + "polling_time=5" # describe codes echo "Describing codes" meds-tab-describe \ - "input_dir=$MIMICIV_MEDS_RESHARD_DIR" "output_dir=$OUTPUT_TABULARIZATION_DIR" + "input_dir=${MIMICIV_MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" echo "Tabularizing static data" echo meds-tab-tabularize-static \ - "input_dir=$MIMICIV_MEDS_RESHARD_DIR" "output_dir=$OUTPUT_TABULARIZATION_DIR" \ + "input_dir=${MIMICIV_MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" \ do_overwrite=False "$@" meds-tab-tabularize-time-series \ --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ - "input_dir=$MIMICIV_MEDS_RESHARD_DIR" "output_dir=$OUTPUT_TABULARIZATION_DIR" \ + "input_dir=${MIMICIV_MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" \ do_overwrite=False "$@" @@ -51,13 +51,13 @@ do --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ - "input_dir=$MIMICIV_MEDS_DIR" "output_dir=$OUTPUT_TABULARIZATION_DIR" \ + "input_dir=${MIMICIV_MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" \ "input_label_dir=${TASKS_DIR}" "task_name=${TASK}" do_overwrite=False "$@" echo "Running xgboost" meds-tab-xgboost \ --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ - "input_dir=$MIMICIV_MEDS_DIR" "output_dir=$OUTPUT_TABULARIZATION_DIR" \ + "input_dir=${MIMICIV_MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" \ "output_model_dir=${OUTPUT_MODEL_DIR}/${TASK}/" "task_name=$TASK" do_overwrite=False "$@" done From d71f9dcf602800f779cf2a845a16ca128dd1bd68 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Mon, 9 Sep 2024 07:42:44 +0000 
Subject: [PATCH 3/5] reduced the number of workers for resharding --- MIMICIV_TUTORIAL/tabularize_meds.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MIMICIV_TUTORIAL/tabularize_meds.sh b/MIMICIV_TUTORIAL/tabularize_meds.sh index 5ba5807..55f5b4f 100644 --- a/MIMICIV_TUTORIAL/tabularize_meds.sh +++ b/MIMICIV_TUTORIAL/tabularize_meds.sh @@ -17,7 +17,7 @@ IFS=',' read -r -a TASK_ARRAY <<< "$TASKS" MEDS_transform-reshard_to_split \ --multirun \ - worker="range(0,$N_PARALLEL_WORKERS)" \ + worker="range(0,6)" \ hydra/launcher=joblib \ input_dir="$MIMICIV_MEDS_DIR" \ cohort_dir="$MIMICIV_MEDS_RESHARD_DIR" \ From 0dc2bc697a6e91b470e932d3c49760a209fe99f6 Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Mon, 9 Sep 2024 13:42:12 +0000 Subject: [PATCH 4/5] updated tabularize meds to take string input for tasks --- MIMICIV_TUTORIAL/README.MD | 2 +- MIMICIV_TUTORIAL/tabularize_meds.sh | 51 ++++++++++++++++++++++++++--- 2 files changed, 48 insertions(+), 5 deletions(-) diff --git a/MIMICIV_TUTORIAL/README.MD b/MIMICIV_TUTORIAL/README.MD index e8ea9ac..e340a09 100644 --- a/MIMICIV_TUTORIAL/README.MD +++ b/MIMICIV_TUTORIAL/README.MD @@ -49,7 +49,7 @@ done export N_PARALLEL_WORKERS=48 # Set number of workers export RESHARD_DIR=??? 
# set to directory to output reshareded meds data bash MIMICIV_TUTORIAL/tabularize_meds.sh "${MIMICIV_MEDS_DIR}" "$RESHARD_DIR" $OUTPUT_TABULARIZATION_DIR \ - ["long_los","icu_mortality"] $TASKS_DIR $OUTPUT_MODEL_DIR $N_PARALLEL_WORKERS \ + "long_los,icu_mortality" $TASKS_DIR $OUTPUT_MODEL_DIR $N_PARALLEL_WORKERS \ "tabularization.aggs=[static/present,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]" \ "tabularization.window_sizes=[2h,12h,1d,7d,30d,365d,full]" ``` diff --git a/MIMICIV_TUTORIAL/tabularize_meds.sh b/MIMICIV_TUTORIAL/tabularize_meds.sh index 55f5b4f..9374c22 100644 --- a/MIMICIV_TUTORIAL/tabularize_meds.sh +++ b/MIMICIV_TUTORIAL/tabularize_meds.sh @@ -2,6 +2,36 @@ set -e +# Function to print help message +print_help() { + echo "Usage: $0 [additional arguments]" + echo + echo "Arguments:" + echo " MIMICIV_MEDS_DIR Directory containing MIMIC-IV medications data" + echo " MIMICIV_MEDS_RESHARD_DIR Directory for resharded MIMIC-IV medications data" + echo " OUTPUT_TABULARIZATION_DIR Output directory for tabularized data" + echo " TASKS Comma-separated list of tasks to run (e.g., 'long_los,icu_mortality')" + echo " TASKS_DIR Directory containing task-specific data" + echo " OUTPUT_MODEL_DIR Output directory for models" + echo " N_PARALLEL_WORKERS Number of parallel workers to use" + echo + echo "Additional arguments will be passed to the underlying commands." +} + +# Check for help flag +if [[ "$1" == "--help" || "$1" == "-h" ]]; then + print_help + exit 0 +fi + +# Check if we have the minimum required number of arguments +if [ "$#" -lt 7 ]; then + echo "Error: Not enough arguments provided." 
+ print_help + exit 1 +fi + +# Assign arguments to variables MIMICIV_MEDS_DIR="$1" MIMICIV_MEDS_RESHARD_DIR="$2" OUTPUT_TABULARIZATION_DIR="$3" @@ -12,9 +42,23 @@ N_PARALLEL_WORKERS="$7" shift 7 +# Split the TASKS string into an array +IFS=',' read -ra TASK_ARRAY <<< "$TASKS" -IFS=',' read -r -a TASK_ARRAY <<< "$TASKS" +# Print input arguments +echo "Input arguments:" +echo "MIMICIV_MEDS_DIR: $MIMICIV_MEDS_DIR" +echo "MIMICIV_MEDS_RESHARD_DIR: $MIMICIV_MEDS_RESHARD_DIR" +echo "OUTPUT_TABULARIZATION_DIR: $OUTPUT_TABULARIZATION_DIR" +echo "TASKS:" "${TASK_ARRAY[@]}" +echo "TASKS_DIR: $TASKS_DIR" +echo "OUTPUT_MODEL_DIR: $OUTPUT_MODEL_DIR" +echo "N_PARALLEL_WORKERS: $N_PARALLEL_WORKERS" +echo "Additional arguments:" "$@" +echo +# Reshard the data +echo "Resharding data" MEDS_transform-reshard_to_split \ --multirun \ worker="range(0,6)" \ @@ -43,10 +87,9 @@ meds-tab-tabularize-time-series \ "input_dir=${MIMICIV_MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" \ do_overwrite=False "$@" - for TASK in "${TASK_ARRAY[@]}" do - echo "Running task_specific_caching.py" + echo "Running task_specific_caching.py for task: $TASK" meds-tab-cache-task \ --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ @@ -54,7 +97,7 @@ do "input_dir=${MIMICIV_MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" \ "input_label_dir=${TASKS_DIR}" "task_name=${TASK}" do_overwrite=False "$@" - echo "Running xgboost" + echo "Running xgboost for task: $TASK" meds-tab-xgboost \ --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ From be5f723cd895d31f4a42ba67dc6d610995aaed0c Mon Sep 17 00:00:00 2001 From: Nassim Oufattole Date: Tue, 10 Sep 2024 03:21:26 +0000 Subject: [PATCH 5/5] fixed job name config bug where we were missing the $ so it was not resolved. 
Fixed bugs in e2e meds-tab mimic script --- MIMICIV_TUTORIAL/README.MD | 18 ++---------------- MIMICIV_TUTORIAL/tabularize_meds.sh | 10 +++++----- src/MEDS_tabular_automl/configs/default.yaml | 2 +- .../configs/launch_model.yaml | 2 +- 4 files changed, 9 insertions(+), 23 deletions(-) diff --git a/MIMICIV_TUTORIAL/README.MD b/MIMICIV_TUTORIAL/README.MD index e340a09..5084824 100644 --- a/MIMICIV_TUTORIAL/README.MD +++ b/MIMICIV_TUTORIAL/README.MD @@ -39,11 +39,11 @@ mkdir -p "${TASKS_DIR}" # create a directory for the task for TASK_NAME in "${TASKS[@]}" do - gcloud storage cp "gs://ehr_standardization_schema/benchmark_v1/data/labels/${TASK_NAME}.parquet" "${TASKS_DIR}/${TASK_NAME}.parquet" + gcloud storage cp "gs://ehr_standardization_schema/benchmark_v1/data/labels/${TASK_NAME}.parquet" "${TASKS_DIR}/${TASK_NAME}/0.parquet" done ``` -## Pre-Processing for Tabularization +## Run Tabularization and XGBoost Baseline ```console export N_PARALLEL_WORKERS=48 # Set number of workers @@ -53,17 +53,3 @@ bash MIMICIV_TUTORIAL/tabularize_meds.sh "${MIMICIV_MEDS_DIR}" "$RESHARD_DIR" $O "tabularization.aggs=[static/present,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]" \ "tabularization.window_sizes=[2h,12h,1d,7d,30d,365d,full]" ``` - -## Train XGBOOST Baseline - -```console -meds-tab-model "input_dir=${MIMICIV_MEDS_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" output_model_dir=$OUTPUT_MODEL_DIR task_name=long_los - "tabularization.aggs=[static/present,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]" \ - "tabularization.window_sizes=[2h,12h,1d,7d,30d,365d,full]" -``` - -```console -meds-tab-model "input_dir=${MIMICIV_MEDS_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" task_name=icu_mortality - "tabularization.aggs=[static/present,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]" \ - "tabularization.window_sizes=[2h,12h,1d,7d,30d,365d,full]" -``` diff --git a/MIMICIV_TUTORIAL/tabularize_meds.sh 
b/MIMICIV_TUTORIAL/tabularize_meds.sh index 9374c22..d81a9d6 100644 --- a/MIMICIV_TUTORIAL/tabularize_meds.sh +++ b/MIMICIV_TUTORIAL/tabularize_meds.sh @@ -76,7 +76,7 @@ meds-tab-describe \ "input_dir=${MIMICIV_MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" echo "Tabularizing static data" -echo meds-tab-tabularize-static \ +meds-tab-tabularize-static \ "input_dir=${MIMICIV_MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" \ do_overwrite=False "$@" @@ -91,16 +91,16 @@ for TASK in "${TASK_ARRAY[@]}" do echo "Running task_specific_caching.py for task: $TASK" meds-tab-cache-task \ - --multirun \ - worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ "input_dir=${MIMICIV_MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" \ - "input_label_dir=${TASKS_DIR}" "task_name=${TASK}" do_overwrite=False "$@" + "input_label_dir=${TASKS_DIR}/${TASK}/" "task_name=${TASK}" do_overwrite=False "$@" echo "Running xgboost for task: $TASK" meds-tab-xgboost \ --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ "input_dir=${MIMICIV_MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" \ - "output_model_dir=${OUTPUT_MODEL_DIR}/${TASK}/" "task_name=$TASK" do_overwrite=False "$@" + "output_model_dir=${OUTPUT_MODEL_DIR}/${TASK}/" "task_name=$TASK" do_overwrite=False \ + "hydra.sweeper.n_trials=1000" "hydra.sweeper.n_jobs=${N_PARALLEL_WORKERS}" \ + "$@" done diff --git a/src/MEDS_tabular_automl/configs/default.yaml b/src/MEDS_tabular_automl/configs/default.yaml index 7d4e392..538bc18 100644 --- a/src/MEDS_tabular_automl/configs/default.yaml +++ b/src/MEDS_tabular_automl/configs/default.yaml @@ -12,7 +12,7 @@ cache_dir: ${output_dir}/.cache hydra: verbose: False job: - name: MEDS_TAB_${name}_${worker}_{now:%Y-%m-%d_%H-%M-%S} + name: MEDS_TAB_${name}_${worker}_${now:%Y-%m-%d_%H-%M-%S} sweep: dir: ${log_dir} run: diff --git a/src/MEDS_tabular_automl/configs/launch_model.yaml b/src/MEDS_tabular_automl/configs/launch_model.yaml index 
f6dc949..3d886b1 100644 --- a/src/MEDS_tabular_automl/configs/launch_model.yaml +++ b/src/MEDS_tabular_automl/configs/launch_model.yaml @@ -22,7 +22,7 @@ name: launch_model hydra: sweep: - dir: ${output_model_dir}/sweeps/{now:%Y-%m-%d-%H-%M-%S}/ + dir: ${output_model_dir}/sweeps/${now:%Y-%m-%d-%H-%M-%S}/ subdir: "1" run: dir: ${path.model_log_dir}