diff --git a/.gitignore b/.gitignore
index 68bc17f..95b15ab 100644
--- a/.gitignore
+++ b/.gitignore
@@ -70,6 +70,9 @@ instance/
 # Sphinx documentation
 docs/_build/
+docs/source/_autosummary/
+docs/source/api/
+docs/source/bin/
 # PyBuilder
 .pybuilder/
@@ -158,3 +161,5 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+
+docs/source/generated
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index fff2f4b..0924093 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -8,6 +8,13 @@ build:
 python:
   install:
     - requirements: docs/requirements.txt
+    - method: pip
+      path: .
+      extra_requirements:
+        - docs
 sphinx:
   configuration: docs/source/conf.py
+# Optionally build your docs in additional formats such as PDF
+formats:
+  - pdf
diff --git a/README.md b/README.md
index 9575b17..086fc04 100644
--- a/README.md
+++ b/README.md
@@ -16,140 +16,180 @@
 This repository provides utilities and scripts to run limited automatic tabular ML pipelines for generic MEDS datasets.

-# Installation
+______________________________________________________________________
+
+# Usage
+
+This repository consists of two key pieces:
+
+1. Construction and efficient loading of tabular (flat, non-longitudinal) summary features describing patient records in MEDS over arbitrary time windows (e.g. 1 year, 6 months, etc.), which go backwards in time from a given index date.
+2. Running a basic XGBoost AutoML pipeline over these tabular features to predict arbitrary binary classification or regression downstream tasks defined over these datasets. The "AutoML" part of this is not particularly advanced -- what is more advanced is the efficient construction, storage, and loading of tabular features for the candidate AutoML models, enabling a far more extensive search over a much larger total number of features than prior systems.
+
+## Quick Start

To install MEDS-Tab, use one of the commands below:

**Pip Install**

-```bash
+```console
pip install meds-tab
```

**Local Install**

-```
+```console
# clone the git repo, then install from the repository root
git clone https://github.com/mmcdermott/MEDS_Tabular_AutoML.git
cd MEDS_Tabular_AutoML
pip install .
```

-# Usage
+## Scripts and Examples

-This repository consists of two key pieces:
+For an end-to-end example over MIMIC-IV, see the [MIMIC-IV companion repository](https://github.com/mmcdermott/MEDS_TAB_MIMIC_IV).
+For an end-to-end example over Philips eICU, see the [eICU companion repository](https://github.com/mmcdermott/MEDS_TAB_EICU).

-1. Construction of and efficient loading of tabular (flat, non-longitudinal) summary features describing
-   patient records in MEDS over arbitrary time-windows (e.g. 1 year, 6 months, etc.) either backwards or
-   forwards in time from a given index date. Naturally, only "look-back" windows should be used for
-   future-event prediction tasks, and are thus currently implemented.
-2. Running a basic XGBoost AutoML pipeline over these tabular features to predict arbitrary binary classification or regression
-   downstream tasks defined over these datasets. The "AutoML" part of this is not particularly advanced --
-   what is more advanced is the efficient construction, storage, and loading of tabular features for the
-   candidate AutoML models, enabling a far more extensive search over different featurization strategies.
+See [`/tests/test_integration.py`](https://github.com/mmcdermott/MEDS_Tabular_AutoML/blob/main/tests/test_integration.py) for a local example of the end-to-end pipeline being run on synthetic data. This script is a functional test that is also run with `pytest` to verify the correctness of the algorithm.

-### Scripts and Examples
+## Why MEDS-Tab?

-See `tests/test_integration.py` for an example of the end-to-end pipeline being run on synthetic data. This
-script is a functional test that is also run with `pytest` to verify the correctness of the algorithm.
+MEDS-Tab is a comprehensive framework designed to streamline the handling, modeling, and analysis of complex medical time-series data. By leveraging automated processes, MEDS-Tab significantly reduces the computation required to generate high-quality baseline models for diverse supervised learning tasks.

-For an end to end example over MIMIC-IV, see the [companion repository](https://github.com/mmcdermott/MEDS_TAB_MIMIC_IV)
-For an end to end example over Philips eICU, see the [eICU companion repository](https://github.com/mmcdermott/MEDS_TAB_EICU).
+- Cost Efficiency: MEDS-Tab is dramatically more cost-effective than existing solutions.
+- Strong Performance: MEDS-Tab delivers robust, high performance across various datasets compared with other frameworks.
+
+### I. Transform to MEDS
+
+MEDS-Tab leverages the recently developed, minimal, easy-to-use Medical Event Data Standard (MEDS) schema to standardize structured EHR data to a consistent schema from which baselines can be reliably produced across arbitrary tasks and settings. In order to use MEDS-Tab, you will first need to transform your raw EHR data to the MEDS format, which can be done using the following libraries:
+
+- [MEDS Polars](https://github.com/mmcdermott/MEDS_polars_functions) for a set of functions and scripts for extracting raw data into MEDS and transforming/pre-processing MEDS-formatted data.
+- [MEDS ETL](https://github.com/Medical-Event-Data-Standard/meds_etl) for a collection of ETLs from common data formats to MEDS. The library currently supports MIMIC-IV, OMOP v5, and MEDS FLAT (a flat version of MEDS).
+
+### II. Run MEDS-Tab
+
+- Run the MEDS-Tab Command-Line Interface tool (`MEDS-Tab-cli`) to extract cohorts based on your task - check out the [Usage Guide](https://meds-tab--36.org.readthedocs.build/en/36/overview.html#core-cli-scripts-overview)!

-### Core CLI Scripts Overview
+- Painless Reproducibility: Use [MEDS-Tab](https://github.com/mmcdermott/MEDS_TAB_MIMIC_IV/tree/main/tasks) to obtain comparable, reproducible, and well-tuned XGBoost results tailored to your dataset-specific feature space!

-1. **`meds-tab-describe`**: This command processes MEDS data shards to compute the frequencies of different code-types
+By following these steps, you can seamlessly transform your dataset, define the necessary criteria, and leverage powerful machine learning tools within the MEDS-Tab ecosystem. This approach not only simplifies the process but also ensures high-quality, reproducible results for your machine learning for health projects. It can reliably take no more than a week of full-time human effort to perform Steps I and II on new datasets in reasonable raw formats!
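+Concretely, a full run strings together the five CLI commands documented below. A minimal sketch of the order of operations (the data path, `$TASK`, and all tuning flags here are placeholders; see the per-command sections that follow for the complete argument lists):
+
+```console
+meds-tab-describe MEDS_cohort_dir="path_to_data"
+meds-tab-tabularize-static MEDS_cohort_dir="path_to_data" tabularization.window_sizes=[1d,full]
+meds-tab-tabularize-time-series MEDS_cohort_dir="path_to_data" tabularization.window_sizes=[1d,full]
+meds-tab-cache-task MEDS_cohort_dir="path_to_data" task_name=$TASK
+meds-tab-xgboost MEDS_cohort_dir="path_to_data" task_name=$TASK output_dir="output_directory"
+```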
+## Core CLI Scripts Overview
+
+1. **`meds-tab-describe`**: This command processes MEDS data shards to compute the frequencies of different code-types. It differentiates codes into the following categories:

   - time-series codes (codes with timestamps)
   - time-series numerical values (codes with timestamps and numerical values)
   - static codes (codes without timestamps)
   - static numerical codes (codes without timestamps but with numerical values).

-   **Caching feature names and frequencies** in a dataset stored in `"path_to_data"`
-
-   ```
-   meds-tab-describe MEDS_cohort_dir="path_to_data"
-   ```
+   This script further caches feature names and frequencies in a `code_metadata.parquet` file within the directory given by the `MEDS_cohort_dir` argument, which is specified as a Hydra-style command-line argument.

2. **`meds-tab-tabularize-static`**: Filters and processes the dataset based on the frequency of codes, generating a tabular vector for each patient at each timestamp in the shards. Each row corresponds to a unique `patient_id` and `timestamp` combination; rows are thus duplicated across multiple timestamps for the same patient.

-   **Tabularizing static data** with the minimum code frequency of 10 and window sizes of `[1d, 30d, 365d, full]` and value aggregation methods of `[static/present, code/count, value/count, value/sum, value/sum_sqd, value/min, value/max]`
+   **Example: Tabularizing static data** with a minimum code frequency of 10, window sizes of `[1d, 30d, 365d, full]`, and value aggregation methods of `[static/present, static/first, code/count, value/count, value/sum, value/sum_sqd, value/min, value/max]`

-   ```
+   ```console
   meds-tab-tabularize-static MEDS_cohort_dir="path_to_data" \
       tabularization.min_code_inclusion_frequency=10 \
       tabularization.window_sizes=[1d,30d,365d,full] \
       do_overwrite=False \
-      tabularization.aggs=[static/present,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]"
+      tabularization.aggs=[static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]
   ```

-3. **`meds-tab-tabularize-time-series`**: Iterates through combinations of a shard, `window_size`, and `aggregation` to generate feature vectors that aggregate patient data for each unique `patient_id` x `timestamp`. This stage (and the previous stage) use sparse matrix formats to efficiently handle the computational and storage demands of rolling window calculations on large datasets. We support parallelization through Hydra's `--multirun` flag and the `joblib` launcher.
+   - For the exhaustive list of supported value aggregations, see [`/src/MEDS_tabular_automl/utils.py`](https://github.com/mmcdermott/MEDS_Tabular_AutoML/blob/main/src/MEDS_tabular_automl/utils.py#L24)
+
+3. **`meds-tab-tabularize-time-series`**: Iterates through combinations of a shard, `window_size`, and `aggregation` to generate feature vectors that aggregate patient data for each unique `patient_id` x `timestamp`. This stage (and the previous stage) uses sparse matrix formats to efficiently handle the computational and storage demands of rolling window calculations on large datasets. We support parallelization through Hydra's [`--multirun`](https://hydra.cc/docs/intro/#multirun) flag and the [`joblib` launcher](https://hydra.cc/docs/plugins/joblib_launcher/#internaldocs-banner).
-   ```
+   **Example: Aggregate time-series data** on features across different `window_sizes`
+
+   ```console
   meds-tab-tabularize-time-series --multirun \
-      worker="range(0,$N_PARALLEL_WORKERS)" \
-      hydra/launcher=joblib \
-      MEDS_cohort_dir="path_to_data" \
-      tabularization.min_code_inclusion_frequency=10 \
-      do_overwrite=False \
-      tabularization.window_sizes=[1d,30d,365d,full] \
-      tabularization.aggs=[static/present,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]
+      worker="range(0,$N_PARALLEL_WORKERS)" \
+      hydra/launcher=joblib \
+      MEDS_cohort_dir="path_to_data" \
+      tabularization.min_code_inclusion_frequency=10 \
+      do_overwrite=False \
+      tabularization.window_sizes=[1d,30d,365d,full] \
+      tabularization.aggs=[static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]
   ```

4. **`meds-tab-cache-task`**: Aligns task-specific labels with the nearest prior event in the tabularized data. It requires a labeled dataset directory with three columns (`patient_id`, `timestamp`, `label`) structured similarly to the `MEDS_cohort_dir`; a sketch of this format is shown after this list.

-   **Aligh tabularized data** for a specific task `$TASK` and labels that has pulled from [ACES](https://github.com/justin13601/ACES)
+   **Example: Align tabularized data** for a specific task `$TASK` with labels that have been extracted via [ACES](https://github.com/justin13601/ACES)

-   ```
+   ```console
   meds-tab-cache-task MEDS_cohort_dir="path_to_data" \
-      task_name=$TASK \
-      tabularization.min_code_inclusion_frequency=10 \
-      do_overwrite=False \
-      tabularization.window_sizes=[1d,30d,365d,full] \
-      tabularization.aggs=[static/present,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]
+      task_name=$TASK \
+      tabularization.min_code_inclusion_frequency=10 \
+      do_overwrite=False \
+      tabularization.window_sizes=[1d,30d,365d,full] \
+      tabularization.aggs=[static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]
   ```

5. **`meds-tab-xgboost`**: Trains an XGBoost model using user-specified parameters. Permutations of `window_sizes` and `aggs` can be generated using the `generate-permutations` command (see the section below for details).

-   ```
+   ```console
   meds-tab-xgboost --multirun \
-      MEDS_cohort_dir="path_to_data" \
-      task_name=$TASK \
-      output_dir="output_directory" \
-      tabularization.min_code_inclusion_frequency=10 \
-      tabularization.window_sizes=$(generate-permutations [1d,30d,365d,full]) \
-      do_overwrite=False \
-      tabularization.aggs=$(generate-permutations [static/present,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max])
+      MEDS_cohort_dir="path_to_data" \
+      task_name=$TASK \
+      output_dir="output_directory" \
+      tabularization.min_code_inclusion_frequency=10 \
+      tabularization.window_sizes=$(generate-permutations [1d,30d,365d,full]) \
+      do_overwrite=False \
+      tabularization.aggs=$(generate-permutations [static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max])
   ```

-6. **`meds-tab-xgboost-sweep`**: Conducts an Optuna hyperparameter sweep to optimize over `window_sizes`, `aggregations`, and `min_code_inclusion_frequency`, aiming to enhance model performance and adaptability.
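+The `meds-tab-cache-task` step above expects its label files to be sharded parquets that mirror the MEDS data layout and carry exactly the three columns named in item 4. A minimal sketch of writing one such shard with Polars (the values and the output path are illustrative only, not a fixed convention of the tool):
+
+```python
+from datetime import datetime
+
+import polars as pl
+
+# Toy labels for one shard; real labels would come from your task
+# definition (e.g., extracted via ACES). All values here are made up.
+labels = pl.DataFrame(
+    {
+        "patient_id": [1, 2, 3],
+        "timestamp": [
+            datetime(2020, 1, 3),
+            datetime(2020, 2, 7),
+            datetime(2020, 3, 1),
+        ],
+        "label": [0, 1, 0],
+    }
+)
+
+# Mirror the shard file structure of the input MEDS data (hypothetical path).
+labels.write_parquet("path_to_data/task_labels/train/0.parquet")
+```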
-### Additional CLI Scripts
+## Additional CLI Scripts

1. **`generate-permutations`**: Generates and prints a sorted list of all permutations from a comma separated input. This is provided for the convenience of sweeping over all possible combinations of window sizes and aggregations. For example, you can directly call **`generate-permutations`** in the command line:

-   ```bash
-   generate-permutations [2,3,4]
+   ```console
+   generate-permutations [2,3,4]
   [2], [2, 3], [2, 3, 4], [2, 4], [3], [3, 4], [4]
   ```

   This could be used in the command line in concert with other calls. For example, the following call:

-   ```bash
+   ```console
   meds-tab-xgboost --multirun tabularization.window_sizes=$(generate-permutations [1d,2d,7d,full])
   ```

   would resolve to:

-   ```bash
+   ```console
   meds-tab-xgboost --multirun tabularization.window_sizes=[1d],[1d,2d],[1d,2d,7d],[1d,2d,7d,full],[1d,2d,full],[1d,7d],[1d,7d,full],[1d,full],[2d],[2d,7d],[2d,7d,full],[2d,full],[7d],[7d,full],[full]
   ```

-# How does MEDS-Tab Work?
+   which can then be correctly interpreted by Hydra's multirun logic.
+
+## Roadmap
+
+MEDS-Tab has several key limitations which we plan to address in future changes. These include, and are tracked by, the following GitHub issues.
+
+### Improvements to the core tabularization
+
+1. Further memory and runtime improvements are possible: [#16](https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues/16)
+2. We should support additional window sizes and types: [#31](https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues/31)
+3. We should support additional aggregation functions: [#32](https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues/32)

-#### What do you mean "tabular pipelines"? Isn't _all_ structured EHR data already tabular?
+### Improvements to the modeling pipeline
+
+1. We should likely decorrelate the default aggregations and/or window sizes we use prior to passing them into the models as features: [#27](https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues/27)
+2. We need to do a detailed parameter study over the hyperparameter sweep options to find good defaults for these kinds of problems and models: [#33](https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues/33)
+3. We should support a more extensive set of pipeline operations and model architectures: [#37](https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues/37)
+
+### Technical debt / code improvements
+
+1. The computation and use of the code metadata dataframe, containing frequencies of codes, should be offloaded to core MEDS functionality, with the remaining code in this repository cleaned up.
+   - [#28](https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues/28)
+   - [#14](https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues/14)
+2. We should add more doctests and push test coverage up to 100%.
+   - [#29](https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues/29)
+   - [#30](https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues/30)
+3. We need to ensure full and seamless compatibility with the ACES CLI tool, rather than relying on the Python API and manual adjustments:
+   [#34](https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues/34)
+
+## What do you mean "tabular pipelines"? Isn't _all_ structured EHR data already tabular?

This is a common misconception. _Tabular_ data refers to data that can be organized in a consistent, logical set of rows/columns such that the entirety of a "sample" or "instance" for modeling or analysis is contained
@@ -162,12 +202,404 @@
realize these constraints, which will explicitly involve summarizing the patient or future windows in time to produce a single row per patient with a consistent, logical set of columns (though there may still be missingness).
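+To make the distinction concrete, here is a toy sketch (in Polars, though any dataframe library illustrates the point) of collapsing a longitudinal record into one tabular row per patient by summarizing a 365-day look-back window; the codes, values, and aggregations shown are invented for illustration:
+
+```python
+from datetime import datetime, timedelta
+
+import polars as pl
+
+# Longitudinal (non-tabular) form: one row per (patient, time, code) event.
+events = pl.DataFrame(
+    {
+        "patient_id": [1, 1, 1, 2],
+        "timestamp": [
+            datetime(2019, 6, 1),
+            datetime(2019, 12, 1),
+            datetime(2020, 1, 1),
+            datetime(2020, 1, 1),
+        ],
+        "code": ["HR", "HR", "CREAT", "HR"],
+        "value": [88.0, 94.0, 1.1, 70.0],
+    }
+)
+
+# Tabular form: one row per patient, with each window/aggregation pair
+# becoming a flat feature column.
+index_date = datetime(2020, 1, 2)
+tabular = (
+    events.filter(pl.col("timestamp") > index_date - timedelta(days=365))
+    .group_by("patient_id")
+    .agg(
+        pl.col("code").count().alias("code/count"),
+        pl.col("value").max().alias("value/max"),
+    )
+)
+print(tabular)
+```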
-## Implementation Improvements
+______________________________________________________________________
+
+# The MEDS-Tab Architecture
+
+In this section, we describe the MEDS-Tab architecture, specifically some of the pipeline choices we made to reduce memory usage and increase speed during the tabularization and XGBoost tuning processes.
+
+We break our method into 4 discrete parts:
+
+1. Describe codes (compute feature frequencies)
+2. Tabularization of time-series data
+3. Efficient data caching for task-specific rows
+4. XGBoost training
+
+## 1. Describe Codes (compute feature frequencies)
+
+This initial stage processes a pre-sharded dataset. We expect a structure as follows, where each shard contains a subset of the patients:
+
+```text
+/PATH/TO/MEDS/DATA
+│
+└─── <SPLIT A>
+│   │   <SHARD 0>.parquet
+│   │   <SHARD 1>.parquet
+│   │   ...
+│
+└─── <SPLIT B>
+│   │   <SHARD 0>.parquet
+│   │   <SHARD 1>.parquet
+│   │   ...
+│
+...
+```
+
+We then compute and store feature frequencies, crucial for determining which features are relevant for further analysis.
+
+**Detailed Workflow:**
+
+- **Data Loading and Sharding**: We iterate through shards to compute feature frequencies for each shard.
+- **Frequency Aggregation**: After computing frequencies across shards, we aggregate them to get a final count of each feature across the entire training dataset, which allows us to filter out infrequent features in the tabularization stage or when tuning XGBoost.
+
+## 2. Tabularization of Time-Series Data
+
+### Overview
+
+The tabularization stage of our pipeline is exposed via the CLI commands:
+
+- `meds-tab-tabularize-static` for tabularizing static data
+- `meds-tab-tabularize-time-series` for tabularizing time-series data
+
+Static data is relatively small in medical datasets, so we use a dense pivot operation, convert it to a sparse matrix, and then duplicate rows such that the static data will match up with the time-series data rows generated in the next step. Static data is currently processed serially.
+
+The script for tabularizing time-series data transforms a raw, unstructured dataset into a structured, feature-rich dataset through a series of data processing steps. This transformation (as depicted in the figure below) involves converting raw time series from a Polars dataframe into a sparse matrix format, aggregating events that occur at the same date for the same patient, and then applying rolling window aggregations to extract temporal features.
+
+![Time Series Tabularization Method](../assets/pivot.png)
+
+### High-Level Tabularization Algorithm
+
+1. **Data Loading and Categorization**:
+
+   - The script iterates through shards of patients; shards can be processed in parallel using Hydra's joblib launcher to spawn multiple processes.
+
+2. **Sparse Matrix Conversion**:
+
+   - Data from the Polars dataframe is converted into a sparse matrix format, where each row represents a unique event (patient x timestamp), and each column corresponds to a MEDS code for the patient.
+
+3. **Rolling Window Aggregation**:
+
+   - For each aggregation method (sum, count, min, max, etc.), events that occur on the same date for the same patient are aggregated. This reduces the amount of data we have to perform rolling windows over.
+   - Then we aggregate features over the specified rolling window sizes.
+
+4. **Output Storage**:
+
+   - The sparse array is converted to coordinate list (COO) format and stored as a `.npz` file on disk.
+   - The file paths look as follows:
+
+```text
+/PATH/TO/MEDS/TABULAR_DATA
+│
+└─── <SPLIT A>
+    ├─── <SHARD 0>
+    │   ├───code
+    │   │   └───count.npz
+    │   └───value
+    │       └───sum.npz
+    ...
+```
+
+## 3. Efficient Data Caching for Task-Specific Rows
+
+Now that we have generated tabular features for all the events in our dataset, we can cache subsets relevant for each task we wish to train a supervised model on. This step is critical for efficiently training machine learning models on task-specific data without having to load the entire dataset.
+
+**Detailed Workflow:**
+
+- **Row Selection Based on Tasks**: Only the data rows that are relevant to the specific tasks are selected and cached. This reduces the memory footprint and speeds up the training process.
+- **Use of Sparse Matrices for Efficient Storage**: Sparse matrices are again employed here to store the selected data efficiently, ensuring that only non-zero data points are kept in memory, thus optimizing both storage and retrieval times.
+
+The file structure for the cached data mirrors that of the tabular data, also consisting of `.npz` files; users must specify the directory that stores the labels. Labels follow the same shard file structure as the input MEDS data from step (1), and the label parquets need `patient_id`, `timestamp`, and `label` columns.
+
+## 4. XGBoost Training
+
+The final stage uses the processed and cached data to train an XGBoost model. This stage is optimized to efficiently handle the sparse data structures produced in earlier stages.
+
+**Detailed Workflow:**
+
+- **Iterator for Data Loading**: Custom iterators are designed to load sparse matrices efficiently into the XGBoost training process, which can handle sparse inputs natively, thus maintaining high computational efficiency.
+- **Training and Validation**: The model is trained using the tabular data, with evaluation steps that include early stopping to prevent overfitting and tuning of hyperparameters based on validation performance.
+- **Hyperparameter Tuning**: We use [Optuna](https://optuna.org/) to tune over XGBoost model parameters, aggregations, window sizes, and the minimum code inclusion frequency.
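+The data-loading iterators mentioned above follow XGBoost's external-data interface. A minimal sketch of the pattern (this is not the repository's exact class; the shard paths, label files, and class name are hypothetical):
+
+```python
+import numpy as np
+import scipy.sparse as sp
+import xgboost as xgb
+
+
+class SparseShardIter(xgb.DataIter):
+    """Streams sparse feature shards into XGBoost one `.npz` file at a time."""
+
+    def __init__(self, shard_paths: list[str], label_paths: list[str]):
+        self._shard_paths = shard_paths
+        self._label_paths = label_paths
+        self._it = 0
+        super().__init__(cache_prefix="xgb_cache")
+
+    def next(self, input_data) -> int:
+        # Return 0 once every shard has been consumed.
+        if self._it == len(self._shard_paths):
+            return 0
+        X = sp.load_npz(self._shard_paths[self._it]).tocsr()  # stays sparse
+        y = np.load(self._label_paths[self._it])
+        input_data(data=X, label=y)
+        self._it += 1
+        return 1
+
+    def reset(self) -> None:
+        self._it = 0
+
+
+# XGBoost handles sparse inputs natively, so shards are never densified:
+# dtrain = xgb.DMatrix(SparseShardIter(shard_paths, label_paths))
+# booster = xgb.train({"objective": "binary:logistic", "tree_method": "hist"}, dtrain)
+```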
+
+______________________________________________________________________

# Computational Performance vs. Existing Pipelines

-# XGBoost Performance

+Evaluating the computational overhead of tabularization methods is essential for assessing their efficiency and suitability for large-scale medical data processing. This section presents a comparative analysis of the computational overhead of MEDS-Tab against other systems such as Catabra and TSFresh. It outlines the performance of each system in terms of wall time, memory usage, and output size, highlighting the computational efficiency and scalability of MEDS-Tab.
+
+## 1. System Comparison Overview
+
+The systems compared in this study represent different approaches to data tabularization, with the main difference being MEDS-Tab's use of sparse tabularization. Specifically, for comparison we used:
+
+1. **Catabra/Catabra-Mem**: Offers data processing capabilities for time-series medical data, with variations to test memory management.
+2. **TSFresh**: Widely known and used for its extensive feature-extraction capabilities.

The benchmarking tests were conducted using the following hardware and software settings:

- **CPU Specification**: 2 x AMD EPYC 7713 64-Core Processor
- **RAM Specification**: 1024GB, 3200MHz, DDR4
- **Software Environment**: Ubuntu 22.04.4 LTS

### MEDS-Tab Tabularization Technique

Tabularization of time-series data, as depicted above, is commonly used in several past works. The only two libraries to our knowledge that provide a full tabularization pipeline are `tsfresh` and `catabra`; `catabra` also offers a slower but more memory-efficient version of its method, which we denote `catabra-mem`. Other libraries either provide only rolling window functionalities (`featuretools`) or just pivoting operations (`Temporai`/`Clairvoyance`, `sktime`, `AutoTS`). We provide a significantly faster and more memory-efficient method. Our findings show that on the MIMIC-IV and eICU medical datasets, MEDS-Tab significantly outperforms both of the above-mentioned methods that provide similar functionality. While `catabra` and `tsfresh` could not even run within a budget of 10 minutes on as few as 10 patients' data for eICU, our method scales to process hundreds of patients with low memory usage under the same time budget. We present the results below.

## 2. Comparative Performance Analysis

The tables below detail computational resource utilization across two datasets and various patient scales, emphasizing the superior performance of MEDS-Tab in all of the scenarios. The tables are organized by dataset and number of patients. For the analysis, the `full` window size and the `code/count` aggregation method were used. Additionally, we use a budget of 10 minutes for running our tests, given that for such a small number of patients (10, 100, and 500 patients) data should be processed quickly. Note that `catabra-mem` is omitted from the tables as it never completed within the 10 minute budget.

### eICU Dataset

The only method that was able to tabularize eICU data was MEDS-Tab. We ran our method with both 100 and 500 patients, the latter roughly tripling the number of codes. MEDS-Tab was efficient in terms of both time and memory usage.

a) 100 Patients

**Table 1: 6,374 Codes, 2,065,608 Rows, Output Shape \[132,461, 6,374\]**

| Wall Time | Avg Memory | Peak Memory | Output Size | Method   |
| --------- | ---------- | ----------- | ----------- | -------- |
| 0m39s     | 5,271 MB   | 14,791 MB   | 362 MB      | meds_tab |

b) 500 Patients

**Table 2: 18,314 Codes, 8,737,355 Rows, Output Shape \[565,014, 18,314\]**

| Wall Time | Avg Memory | Peak Memory | Output Size | Method   |
| --------- | ---------- | ----------- | ----------- | -------- |
| 3m4s      | 8,335 MB   | 15,102 MB   | 1,326 MB    | meds_tab |

### MIMIC-IV Dataset

MEDS-Tab, `tsfresh`, and `catabra` were tested across three different patient scales on MIMIC-IV.

-## XGBoost Model Performance on MIMIC-IV

+a) 10 Patients

+This table illustrates the efficiency of MEDS-Tab in processing a small subset of patients with extremely low computational cost and high data throughput, outperforming `tsfresh` and `catabra` in both time and memory efficiency.
+ +**Table 3: 1,504 Codes, 23,346 Rows, Output Shape \[2,127, 1,504\]** + +| Wall Time | Avg Memory | Peak Memory | Output Size | Method | +| --------- | ---------- | ----------- | ----------- | -------- | +| 0m2s | 423 MB | 943 MB | 7 MB | meds_tab | +| 1m41s | 84,159 MB | 265,877 MB | 1 MB | tsfresh | +| 0m15s | 2,537 MB | 4,781 MB | 1 MB | catabra | + +b) 100 Patients + +The performance gap was further highlighted with an increased number of patients and codes. For a moderate patient count, MEDS-Tab demonstrated superior performance with significantly lower wall times and memory usage compared to `tsfresh` and `catabra`. + +**Table 4: 4,154 Codes, 150,789 Rows, Output Shape \[15,664, 4,154\]** + +| Wall Time | Avg Memory | Peak Memory | Output Size | Method | +| --------- | ---------- | ----------- | ----------- | -------- | +| 0m5s | 718 MB | 1,167 MB | 45 MB | meds_tab | +| 5m9s | 217,477 MB | 659,735 MB | 4 MB | tsfresh | +| 3m17s | 14,319 MB | 28,342 MB | 4 MB | catabra | + +c) 500 Patients + +Scaling further to 500 patients, MEDS-Tab maintained consistent performance, reinforcing its capability to manage large datasets efficiently. Because of the set time limit of 10 minutes, we could not get results for `catabra` and `tsfresh`. In comparison, MEDS-Tab processed the data in about 15 seconds, making it at least 40 times faster for the given patient scale. + +**Table 5: 48,115 Codes, 795,368 Rows, Output Shape \[75,595, 8,115\]** + +| Wall Time | Avg Memory | Peak Memory | Output Size | Method | +| --------- | ---------- | ----------- | ----------- | -------- | +| 0m16s | 1,410 MB | 3,539 MB | 442 MB | meds_tab | + +______________________________________________________________________ + +# Prediction Performance + +## XGBoost Model Performance on MIMIC-IV Tasks + +Evaluating our tabularization approach for baseline models involved training XGBoost across a spectrum of binary clinical prediction tasks, using data from the MIMIC-IV database. These tasks encompassed diverse outcomes such as mortality predictions over different intervals, readmission predictions, and lengths of stay (LOS) in both ICU and hospital settings. + +Each task is characterized by its specific label and prediction time. For instance, predicting "30-day readmission" involves assessing whether a patient returns to the hospital within 30 days, with predictions made at the time of discharge. This allows input features to be derived from the entire duration of the patient's admission. In contrast, tasks like "In ICU Mortality" focus on predicting the occurrence of death using only data from the first 24 or 48 hours of ICU admission. Specifically, we use the terminology "Index Timestamp" to mean the timestamp such that no event included as input will occur later than this point. + +We optimize predictive accuracy and model performance by using varied window sizes and aggregations of patient data. This approach allows us to effectively capture and leverage the temporal dynamics and clinical nuances inherent in each prediction task. + +### 1. XGBoost Time and Memory Profiling on MIMIC-IV + +A single XGBoost run was completed to profile time and memory usage. 
This was done for each `$TASK` using the following command:

```console
meds-tab-xgboost \
      MEDS_cohort_dir="path_to_data" \
      task_name=$TASK \
      output_dir="output_directory" \
      do_overwrite=False
```

This uses the default minimum code inclusion frequency, window sizes, and aggregations from `launch_xgboost.yaml`:

```yaml
allowed_codes: # allows all codes that meet min code inclusion frequency
min_code_inclusion_frequency: 10
window_sizes:
  - 1d
  - 7d
  - 30d
  - 365d
  - full
aggs:
  - static/present
  - static/first
  - code/count
  - value/count
  - value/sum
  - value/sum_sqd
  - value/min
  - value/max
```

Since this includes every window size and aggregation, it is the most expensive to run. The runtimes and memory usage are reported below.

#### 1.1 XGBoost Runtimes and Memory Usage on MIMIC-IV Tasks

| Task                            | Index Timestamp   | Real Time | User Time | Sys Time | Avg Memory (MiB) | Peak Memory (MiB) |
| ------------------------------- | ----------------- | --------- | --------- | -------- | ---------------- | ----------------- |
| Post-discharge 30 day Mortality | Discharge         | 2m59s     | 3m38s     | 0m38s    | 9,037            | 11,955            |
| Post-discharge 1 year Mortality | Discharge         | 5m16s     | 6m10s     | 0m59s    | 10,804           | 12,330            |
| 30 day Readmission              | Discharge         | 2m30s     | 3m3s      | 0m39s    | 13,199           | 18,677            |
| In ICU Mortality                | Admission + 24 hr | 0m38s     | 1m3s      | 0m13s    | 1,712            | 2,986             |
| In ICU Mortality                | Admission + 48 hr | 0m34s     | 1m1s      | 0m13s    | 1,613            | 2,770             |
| In Hospital Mortality           | Admission + 24 hr | 2m8s      | 2m41s     | 0m32s    | 9,072            | 12,056            |
| In Hospital Mortality           | Admission + 48 hr | 1m54s     | 2m25s     | 0m29s    | 8,858            | 12,371            |
| LOS in ICU > 3 days             | Admission + 24 hr | 2m3s      | 2m37s     | 0m28s    | 4,650            | 5,715             |
| LOS in ICU > 3 days             | Admission + 48 hr | 1m44s     | 2m18s     | 0m24s    | 4,453            | 5,577             |
| LOS in Hospital > 3 days        | Admission + 24 hr | 6m5s      | 7m5s      | 1m4s     | 11,012           | 12,223            |
| LOS in Hospital > 3 days        | Admission + 48 hr | 6m10s     | 7m12s     | 1m4s     | 10,703           | 11,830            |

#### 1.2 MIMIC-IV Task Specific Training Cohort Size

To better understand the runtimes, we also report the task-specific cohort sizes.

| Task                            | Index Timestamp   | Number of Patients | Number of Events |
| ------------------------------- | ----------------- | ------------------ | ---------------- |
| Post-discharge 30 day Mortality | Discharge         | 149,014            | 356,398          |
| Post-discharge 1 year Mortality | Discharge         | 149,014            | 356,398          |
| 30 day Readmission              | Discharge         | 17,418             | 377,785          |
| In ICU Mortality                | Admission + 24 hr | 7,839              | 22,811           |
| In ICU Mortality                | Admission + 48 hr | 6,750              | 20,802           |
| In Hospital Mortality           | Admission + 24 hr | 51,340             | 338,614          |
| In Hospital Mortality           | Admission + 48 hr | 47,231             | 348,289          |
| LOS in ICU > 3 days             | Admission + 24 hr | 42,809             | 61,342           |
| LOS in ICU > 3 days             | Admission + 48 hr | 42,805             | 61,327           |
| LOS in Hospital > 3 days        | Admission + 24 hr | 152,126            | 360,208          |
| LOS in Hospital > 3 days        | Admission + 48 hr | 152,120            | 359,020          |
### 2. MIMIC-IV Sweep

The XGBoost sweep was run using the following command for each `$TASK`:

```console
meds-tab-xgboost --multirun \
      MEDS_cohort_dir="path_to_data" \
      task_name=$TASK \
      output_dir="output_directory" \
      tabularization.window_sizes=$(generate-permutations [1d,30d,365d,full]) \
      do_overwrite=False \
      tabularization.aggs=$(generate-permutations [static/present,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max])
```

The model parameters were set to:

```yaml
model:
  booster: gbtree
  device: cpu
  nthread: 1
  tree_method: hist
  objective: binary:logistic
```

The Hydra sweeper swept over the parameters:

```yaml
params:
  +model_params.model.eta: tag(log, interval(0.001, 1))
  +model_params.model.lambda: tag(log, interval(0.001, 1))
  +model_params.model.alpha: tag(log, interval(0.001, 1))
  +model_params.model.subsample: interval(0.5, 1)
  +model_params.model.min_child_weight: interval(1e-2, 100)
  +model_params.model.max_depth: range(2, 16)
  model_params.num_boost_round: range(100, 1000)
  model_params.early_stopping_rounds: range(1, 10)
  tabularization.min_code_inclusion_frequency: tag(log, range(10, 1000000))
```

Note that the XGBoost command shown includes `tabularization.window_sizes` and `tabularization.aggs` in the parameters to sweep over.

For a complete example on MIMIC-IV and for all of our config files, see the [MIMIC-IV companion repository](https://github.com/mmcdermott/MEDS_TAB_MIMIC_IV).

#### 2.1 XGBoost Performance on MIMIC-IV

| Task                            | Index Timestamp   | AUC   | Minimum Code Inclusion Frequency | Number of Included Codes\* | Window Sizes           | Aggregations                                                                |
| ------------------------------- | ----------------- | ----- | -------------------------------- | -------------------------- | ---------------------- | --------------------------------------------------------------------------- |
| Post-discharge 30 day Mortality | Discharge         | 0.935 | 1,371                            | 5,712                      | \[7d,full\]            | \[code/count,value/count,value/min,value/max\]                               |
| Post-discharge 1 year Mortality | Discharge         | 0.898 | 289                              | 10,048                     | \[2h,12h,1d,30d,full\] | \[static/present,code/count,value/sum_sqd,value/min\]                        |
| 30 day Readmission              | Discharge         | 0.708 | 303                              | 9,903                      | \[30d,365d,full\]      | \[code/count,value/count,value/sum,value/sum_sqd,value/max\]                 |
| In ICU Mortality                | Admission + 24 hr | 0.661 | 7,059                            | 3,037                      | \[12h,full\]           | \[static/present,code/count,value/sum,value/min,value/max\]                  |
| In ICU Mortality                | Admission + 48 hr | 0.673 | 71                               | 16,112                     | \[1d,7d,full\]         | \[static/present,code/count,value/sum,value/min,value/max\]                  |
| In Hospital Mortality           | Admission + 24 hr | 0.812 | 43                               | 18,989                     | \[1d,full\]            | \[static/present,code/count,value/sum,value/min,value/max\]                  |
| In Hospital Mortality           | Admission + 48 hr | 0.810 | 678                              | 7,433                      | \[1d,full\]            | \[static/present,code/count,value/count\]                                    |
| LOS in ICU > 3 days             | Admission + 24 hr | 0.946 | 30,443                           | 1,624                      | \[2h,7d,30d\]          | \[static/present,code/count,value/count,value/sum,value/sum_sqd,value/max\]  |
| LOS in ICU > 3 days             | Admission + 48 hr | 0.967 | 2,864                            | 4,332                      | \[2h,7d,30d\]          | \[code/count,value/sum_sqd,value/max\]                                       |
| LOS in Hospital > 3 days        | Admission + 24 hr | 0.943 | 94,633                           | 912                        | \[12h,1d,7d\]          | \[code/count,value/count,value/sum_sqd\]                                     |
| LOS in Hospital > 3 days        | Admission + 48 hr | 0.945 | 30,880                           | 1,619                      | \[1d,7d,30d\]          | \[code/count,value/sum,value/min,value/max\]                                 |

\* Number of Included Codes is based on the Minimum Code Inclusion Frequency -- we calculated the number of resulting codes that were above the minimum threshold and reported that.
#### 2.2 XGBoost Optimal Found Model Parameters

Additionally, the model parameters from the highest performing run are reported below.

| Task                            | Index Timestamp   | Eta   | Lambda | Alpha | Subsample | Minimum Child Weight | Number of Boosting Rounds | Early Stopping Rounds | Max Tree Depth |
| ------------------------------- | ----------------- | ----- | ------ | ----- | --------- | -------------------- | ------------------------- | --------------------- | -------------- |
| Post-discharge 30 day Mortality | Discharge         | 0.006 | 0.032  | 0.374 | 0.572     | 53                   | 703                       | 9                     | 16             |
| Post-discharge 1 year Mortality | Discharge         | 0.009 | 0.086  | 0.343 | 0.899     | 76                   | 858                       | 9                     | 11             |
| 30 day Readmission              | Discharge         | 0.006 | 0.359  | 0.374 | 0.673     | 53                   | 712                       | 9                     | 16             |
| In ICU Mortality                | Admission + 24 hr | 0.038 | 0.062  | 0.231 | 0.995     | 89                   | 513                       | 7                     | 14             |
| In ICU Mortality                | Admission + 48 hr | 0.044 | 0.041  | 0.289 | 0.961     | 91                   | 484                       | 5                     | 14             |
| In Hospital Mortality           | Admission + 24 hr | 0.028 | 0.013  | 0.011 | 0.567     | 11                   | 454                       | 6                     | 9              |
| In Hospital Mortality           | Admission + 48 hr | 0.011 | 0.060  | 0.179 | 0.964     | 84                   | 631                       | 7                     | 13             |
| LOS in ICU > 3 days             | Admission + 24 hr | 0.012 | 0.090  | 0.137 | 0.626     | 26                   | 650                       | 8                     | 14             |
| LOS in ICU > 3 days             | Admission + 48 hr | 0.012 | 0.049  | 0.200 | 0.960     | 84                   | 615                       | 7                     | 13             |
| LOS in Hospital > 3 days        | Admission + 24 hr | 0.008 | 0.067  | 0.255 | 0.989     | 90                   | 526                       | 5                     | 14             |
| LOS in Hospital > 3 days        | Admission + 48 hr | 0.001 | 0.030  | 0.028 | 0.967     | 9                    | 538                       | 8                     | 7              |

-## XGBoost Model Performance on Philips eICU
+## XGBoost Model Performance on eICU Tasks

### eICU Sweep

The eICU sweep was conducted equivalently to the MIMIC-IV sweep. Please refer to the MIMIC-IV Sweep subsection above for details on the commands and sweep parameters.

For more details about eICU-specific task generation and running, see the [eICU companion repository](https://github.com/mmcdermott/MEDS_TAB_EICU).

#### 1. XGBoost Performance on eICU

| Task                            | Index Timestamp   | AUC   | Minimum Code Inclusion Frequency | Window Sizes             | Aggregations                                                   |
| ------------------------------- | ----------------- | ----- | -------------------------------- | ------------------------ | -------------------------------------------------------------- |
| Post-discharge 30 day Mortality | Discharge         | 0.603 | 68,235                           | \[12h,1d,full\]          | \[code/count,value/sum_sqd,value/max\]                          |
| Post-discharge 1 year Mortality | Discharge         | 0.875 | 3,280                            | \[30d,365d\]             | \[static/present,value/sum,value/sum_sqd,value/min,value/max\]  |
| In Hospital Mortality           | Admission + 24 hr | 0.855 | 335,912                          | \[2h,7d,30d,365d,full\]  | \[static/present,code/count,value/count,value/min,value/max\]   |
| In Hospital Mortality           | Admission + 48 hr | 0.570 | 89,121                           | \[12h,1d,30d\]           | \[code/count,value/count,value/min\]                            |
| LOS in ICU > 3 days             | Admission + 24 hr | 0.783 | 7,881                            | \[1d,30d,full\]          | \[static/present,code/count,value/count,value/sum,value/max\]   |
| LOS in ICU > 3 days             | Admission + 48 hr | 0.757 | 1,719                            | \[2h,12h,7d,30d,full\]   | \[code/count,value/count,value/sum,value/sum_sqd,value/min\]    |
| LOS in Hospital > 3 days        | Admission + 24 hr | 0.864 | 160                              | \[1d,30d,365d,full\]     | \[static/present,code/count,value/min,value/max\]               |
| LOS in Hospital > 3 days        | Admission + 48 hr | 0.895 | 975                              | \[12h,1d,30d,365d,full\] | \[code/count,value/count,value/sum,value/sum_sqd\]              |
#### 2. XGBoost Optimal Found Model Parameters

| Task                            | Index Timestamp   | Eta   | Lambda | Alpha | Subsample | Minimum Child Weight | Number of Boosting Rounds | Early Stopping Rounds | Max Tree Depth |
| ------------------------------- | ----------------- | ----- | ------ | ----- | --------- | -------------------- | ------------------------- | --------------------- | -------------- |
| In Hospital Mortality           | Admission + 24 hr | 0.043 | 0.001  | 0.343 | 0.879     | 13                   | 574                       | 9                     | 14             |
| In Hospital Mortality           | Admission + 48 hr | 0.002 | 0.002  | 0.303 | 0.725     | 0                    | 939                       | 9                     | 12             |
| LOS in ICU > 3 days             | Admission + 24 hr | 0.210 | 0.189  | 0.053 | 0.955     | 5                    | 359                       | 6                     | 14             |
| LOS in ICU > 3 days             | Admission + 48 hr | 0.340 | 0.393  | 0.004 | 0.900     | 6                    | 394                       | 10                    | 13             |
| LOS in Hospital > 3 days        | Admission + 24 hr | 0.026 | 0.238  | 0.033 | 0.940     | 46                   | 909                       | 5                     | 11             |
| LOS in Hospital > 3 days        | Admission + 48 hr | 0.100 | 0.590  | 0.015 | 0.914     | 58                   | 499                       | 10                    | 9              |
| Post-discharge 30 day Mortality | Discharge         | 0.003 | 0.0116 | 0.001 | 0.730     | 13                   | 986                       | 7                     | 7              |
| Post-discharge 1 year Mortality | Discharge         | 0.005 | 0.006  | 0.002 | 0.690     | 93                   | 938                       | 6                     | 14             |

#### 3. eICU Task Specific Training Cohort Size

| Task                            | Index Timestamp   | Number of Patients | Number of Events |
| ------------------------------- | ----------------- | ------------------ | ---------------- |
| Post-discharge 30 day Mortality | Discharge         | 91,405             | 91,405           |
| Post-discharge 1 year Mortality | Discharge         | 91,405             | 91,405           |
| In Hospital Mortality           | Admission + 24 hr | 3,585              | 3,585            |
| In Hospital Mortality           | Admission + 48 hr | 1,527              | 1,527            |
| LOS in ICU > 3 days             | Admission + 24 hr | 12,672             | 14,004           |
| LOS in ICU > 3 days             | Admission + 48 hr | 12,712             | 14,064           |
| LOS in Hospital > 3 days        | Admission + 24 hr | 99,540             | 99,540           |
| LOS in Hospital > 3 days        | Admission + 48 hr | 99,786             | 99,786           |
diff --git a/docs/assets/dark_purple_meds_tab.png b/docs/assets/dark_purple_meds_tab.png
new file mode 100644
index 0000000..9e890ba
Binary files /dev/null and b/docs/assets/dark_purple_meds_tab.png differ
diff --git a/docs/assets/light_purple_meds_tab.png b/docs/assets/light_purple_meds_tab.png
new file mode 100644
index 0000000..dc0b9be
Binary files /dev/null and b/docs/assets/light_purple_meds_tab.png differ
diff --git a/docs/assets/main_fig.png b/docs/assets/main_fig.png
new file mode 100644
index 0000000..df360aa
Binary files /dev/null and b/docs/assets/main_fig.png differ
diff --git a/docs/assets/pivot.png b/docs/assets/pivot.png
new file mode 100644
index 0000000..8887f22
Binary files /dev/null and b/docs/assets/pivot.png differ
diff --git a/docs/assets/white_meds_tab.png b/docs/assets/white_meds_tab.png
new file mode 100644
index 0000000..8c488e7
Binary files /dev/null and b/docs/assets/white_meds_tab.png differ
diff --git a/docs/generate.sh b/docs/generate.sh
new file mode 100644
index 0000000..087fac6
--- /dev/null
+++ b/docs/generate.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+# bash generate.sh
+
+set -e
+
+rm -rf build
+make html
+cd build/html
+python -m http.server
diff --git a/docs/make.bat b/docs/make.bat
index 747ffb7..319c288 100644
--- a/docs/make.bat
+++ b/docs/make.bat
@@ -10,6 +10,8 @@ if "%SPHINXBUILD%" == "" (
 set SOURCEDIR=source
 set BUILDDIR=build

+if "%1" == "" goto help
+
 %SPHINXBUILD% >NUL 2>NUL
 if errorlevel 9009 (
 echo.
@@ -19,12 +21,10 @@ if errorlevel 9009 (
 echo.may add the Sphinx directory to PATH.
 echo.
echo.If you don't have Sphinx installed, grab it from - echo.https://www.sphinx-doc.org/ + echo.http://sphinx-doc.org/ exit /b 1 ) -if "%1" == "" goto help - %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% goto end @@ -32,4 +32,4 @@ goto end %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% :end -popd +popd diff --git a/docs/requirements.txt b/docs/requirements.txt index 85aba40..7f214a7 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,15 +1,14 @@ sphinx==7.1.2 sphinx-rtd-theme==1.3.0rc1 sphinx-collections -recommonmark -piccolo_theme +sphinx_immaterial sphinx_subfigure nbsphinx myst_parser pypandoc linkify-it-py -ipykernel omegaconf ipywidgets ipykernel ipython +pydata-sphinx-theme diff --git a/docs/source/_static/switcher.json b/docs/source/_static/switcher.json new file mode 100644 index 0000000..ba475ce --- /dev/null +++ b/docs/source/_static/switcher.json @@ -0,0 +1,12 @@ +[ + { + "version": "dev", + "url": "https://meds-tab.readthedocs.io/en/latest/" + }, + { + "name": "0.0.2 (stable)", + "version": "v0.0.2", + "url": "https://meds-tab.readthedocs.io/en/stable/", + "preferred": true + } + ] diff --git a/docs/source/api.rst b/docs/source/api.rst deleted file mode 100644 index 463c193..0000000 --- a/docs/source/api.rst +++ /dev/null @@ -1,8 +0,0 @@ -API -==== - -.. autosummary:: - :toctree: generated - :recursive: - - src diff --git a/docs/source/computational-performance.rst b/docs/source/computational-performance.rst deleted file mode 100644 index 68a924c..0000000 --- a/docs/source/computational-performance.rst +++ /dev/null @@ -1,7 +0,0 @@ -Computational Performance vs. Existing Pipelines -================================================ - -.. include:: ../../README.md - :parser: markdown - :start-after: Computational Performance vs. Existing Pipelines - :end-before: XGBoost Model Performance on MIMIC-IV diff --git a/docs/source/conf.py b/docs/source/conf.py index 34fb932..bac8f33 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -1,5 +1,9 @@ import os +import shutil import sys +from pathlib import Path + +import MEDS_tabular_automl # Configuration file for the Sphinx documentation builder. # @@ -10,83 +14,385 @@ # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information project = "MEDS-Tab" -copyright = "2024, Matthew McDermott, Nassim Oufattole, Teya Bergamaschi" -author = "Matthew McDermott, Nassim Oufattole, Teya Bergamaschi" -release = "0.0.1" -version = "0.0.1" +copyright = "2024, Nassim Oufattole, Matthew McDermott, Teya Bergamaschi, Aleksia Kolo, Hyewon Jeong" +author = "Nassim Oufattole, Matthew McDermott, Teya Bergamaschi, Aleksia Kolo, Hyewon Jeong" +# Define the json_url for our version switcher. + + +json_url = "https://meds-tab.readthedocs.io/en/latest/_static/switcher.json" +# Define the version we use for matching in the version switcher. +version_match = os.environ.get("READTHEDOCS_VERSION") +release = MEDS_tabular_automl.__version__ +# If READTHEDOCS_VERSION doesn't exist, we're not on RTD +# If it is an integer, we're in a PR build and the version isn't correct. +# If it's "latest" → change to "dev" (that's what we want the switcher to call it) +if not version_match or version_match.isdigit(): + # For local development, infer the version to match from the package. 
+ if "dev" in release or "rc" in release: + version_match = "dev" + # We want to keep the relative reference if we are in dev mode + # but we want the whole url if we are effectively in a released version + json_url = "_static/switcher.json" + else: + version_match = f"v{release}" +elif version_match == "latest": + version_match = "dev" +elif version_match == "stable": + version_match = f"v{release}" + +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +language = "en" +# -- Path setup + +__location__ = Path(os.path.dirname(__file__)) +__src__ = __location__ / "../.." + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +sys.path.insert(0, str(__src__)) + + +def ensure_pandoc_installed(_): + """Source: https://stackoverflow.com/questions/62398231/building-docs-fails-due-to-missing-pandoc""" + import pypandoc + + # Download pandoc if necessary. If pandoc is already installed and on + # the PATH, the installed version will be used. Otherwise, we will + # download a copy of pandoc into docs/bin/ and add that to our PATH. + pandoc_dir = str(__location__ / "bin") + # Add dir containing pandoc binary to the PATH environment variable + if pandoc_dir not in os.environ["PATH"].split(os.pathsep): + os.environ["PATH"] += os.pathsep + pandoc_dir + + pypandoc.ensure_pandoc_installed( + targetfolder=pandoc_dir, + delete_installer=True, + ) -# -- General configuration --------------------------------------------------- -# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration -sys.path.insert(0, os.path.abspath("../..")) +# -- Run sphinx-apidoc +# This ensures we don't need to run apidoc manually. +# TODO: use https://github.com/sphinx-extensions2/sphinx-autodoc2 + +from sphinx.ext import apidoc + +output_dir = __location__ / "api" +module_dir = __src__ / "src/MEDS_tabular_automl" +if output_dir.is_dir(): + shutil.rmtree(output_dir) + +try: + cmd_line = f"--implicit-namespaces -e -f -o {output_dir} {module_dir}" + apidoc.main(cmd_line.split(" ")) +except Exception as e: # pylint: disable=broad-except + print(f"Running `sphinx-apidoc {cmd_line}` failed!\n{e}") + + +# -- General configuration + + +# -- Project information extensions = [ "sphinx.ext.duration", "sphinx.ext.doctest", "sphinx.ext.autodoc", "sphinx.ext.autosummary", "sphinx.ext.intersphinx", + "sphinx.ext.todo", + "sphinx.ext.viewcode", + "sphinx.ext.coverage", + "sphinx.ext.ifconfig", + "sphinx.ext.mathjax", "sphinx.ext.napoleon", - "sphinx_rtd_theme", - "recommonmark", - # "sphinx_immaterial" + "sphinx.ext.imgconverter", + "sphinxcontrib.collections", + "sphinx_subfigure", + "myst_parser", + "nbsphinx", + # "sphinx_immaterial", ] - -source_suffix = { - ".rst": "restructuredtext", - ".txt": "markdown", - ".md": "markdown", +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. 
+html_theme = "pydata_sphinx_theme" +# html_sidebars = {"**": []} # ["logo-text.html", "globaltoc.html", "localtoc.html", "searchbox.html"] +html_sidebars = { + "api/*": [ + "sidebar-nav-bs", + ], + "**": [], } +nbsphinx_allow_errors = True + + +collections_dir = __location__ / "_collections" +if not collections_dir.is_dir(): + os.mkdir(collections_dir) +python_version = ".".join(map(str, sys.version_info[0:2])) intersphinx_mapping = { - "python": ("https://docs.python.org/3/", None), - "sphinx": ("https://www.sphinx-doc.org/en/master/", None), + "sphinx": ("https://www.sphinx-doc.org/en/master", None), + "python": ("https://docs.python.org/" + python_version, None), + "matplotlib": ("https://matplotlib.org", None), + "numpy": ("https://numpy.org/doc/stable", None), + "sklearn": ("https://scikit-learn.org/stable", None), + "pandas": ("https://pandas.pydata.org/docs", None), + "pandera": ("https://pandera.readthedocs.io/en/stable", None), + "scipy": ("https://docs.scipy.org/doc/scipy/reference", None), + "setuptools": ("https://setuptools.pypa.io/en/stable/", None), + "pyscaffold": ("https://pyscaffold.org/en/stable", None), + "hyperimpute": ("https://hyperimpute.readthedocs.io/en/latest/", None), + "xgbse": ("https://loft-br.github.io/xgboost-survival-embeddings/", None), + "lifelines": ("https://lifelines.readthedocs.io/en/stable/", None), + "optuna": ("https://optuna.readthedocs.io/en/stable/", None), } intersphinx_disabled_domains = ["std"] templates_path = ["_templates"] -exclude_patterns = [] -autosummary_generate = True +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] + +# Control options for included jupyter notebooks. +nb_execution_mode = "off" + + +# -- Options for HTML output +# Configure MyST-Parser +myst_enable_extensions = [ + "amsmath", + "colon_fence", + "deflist", + "dollarmath", + "html_image", + "linkify", + "replacements", + "smartquotes", + "substitution", + "tasklist", +] + +myst_update_mathjax = True + +# MyST URL schemes. +myst_url_schemes = { + "http": None, + "https": None, + "ftp": None, + "mailto": None, + "repo-code": "https://github.com/mmcdermott/MEDS_Tabular_AutoML/tree/main/{{path}}#{{fragment}}", + # "doi": "https://doi.org/{{path}}", + # "gh-issue": { + # "url": "https://github.com/executablebooks/MyST-Parser/issue/{{path}}#{{fragment}}", + # "title": "Issue #{{path}}", + # "classes": ["github"], + # }, +} + +# The suffix of source filenames. +source_suffix = [".rst", ".md"] + +# The encoding of source files. +# source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = "index" + +# The reST default role (used for this markup: `text`) to use for all documents. +default_role = "py:obj" + +# If true, '()' will be appended to :func: etc. cross-reference text. +# add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +# add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +# show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +# https://pygments.org/styles/ pygments_style = "tango" +# A list of ignored prefixes for module index sorting. 
+# modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +# keep_warnings = False + +# If this is True, todo emits a warning for each TODO entries. The default is False. +todo_emit_warnings = True -# -- Options for HTML output ------------------------------------------------- -# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output -# html_theme = "sphinx_rtd_theme" -html_theme = "piccolo_theme" -# html_theme = "sphinx_immaterial" -html_static_path = ["_static"] +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. -html_title = f"MEDS-Tab v{version} Documentation" -html_short_title = "MEDS-Tab Documentation" +html_title = f"MEDS-Tab v{release} Documentation" +html_short_title = "MEDS-Tab" # html_logo = "query-512.png" # html_favicon = "query-16.ico" -# html_sidebars = {"**": ["logo-text.html", "globaltoc.html", "localtoc.html", "searchbox.html"]} - +# Material theme options (see theme.conf for more information) html_theme_options = { - "dark_mode_code_blocks": False, - # "nav_title": "MEDS-Tab", - # "palette": {"primary": "green", "accent": "green"}, - # "repo_url": "https://github.com/mmcdermott/MEDS_Tabular_AutoML", - # "repo_name": "MEDS_Tabular_AutoML", - # # Visible levels of the global TOC; -1 means unlimited - # "globaltoc_depth": 3, - # If False, expand all TOC entries - "globaltoc_collapse": True, - # If True, show hidden TOC entries - "globaltoc_includehidden": False, + "logo": { + "text": "MEDS-TAB", + "image_light": "../assets/dark_purple_meds_tab.png", + "image_dark": "../assets/light_purple_meds_tab.png", + }, + "icon_links": [ + { + "name": "GitHub", + "url": "https://github.com/mmcdermott/MEDS_Tabular_AutoML", # required + "icon": "fa-brands fa-github", + "type": "fontawesome", + }, + { + "name": "PyPI", + "url": "https://pypi.org/project/meds-tab/", + "icon": "fa-brands fa-python", + }, + ], + "header_links_before_dropdown": 6, + "show_toc_level": 1, + "navbar_align": "left", # [left, content, right] For testing that the navbar items align properly + # "show_nav_level": 2, + "announcement": "This is a community-supported tool. If you'd like to contribute, check out our GitHub repository. Your contributions are welcome!", # noqa E501 + "show_version_warning_banner": True, + "switcher": { + "json_url": json_url, + "version_match": version_match, + }, + "navbar_center": ["version-switcher", "navbar-nav"], + "footer_start": ["copyright"], + "footer_center": ["sphinx-version"], + "use_edit_page_button": True, + # "secondary_sidebar_items": { + # "**/*": ["page-toc", "edit-this-page", "sourcelink"], + # }, + "back_to_top_button": True, +} + +html_context = { + "github_user": "mmcdermott", + "github_repo": "MEDS_Tabular_AutoML", + "github_version": "main", + "doc_path": "docs/source", } +# html_theme_options = { +# # Set the name of the project to appear in the navigation. 
+# "nav_title": "MEDS-TAB", +# "palette": {"primary": "purple", "accent": "purple"}, +# # { +# # "media": "(prefers-color-scheme: light)", +# # "scheme": "default", +# # "toggle": { +# # "icon": "material/toggle-switch-off-outline", +# # "name": "Switch to dark mode", +# # }, +# # }, +# # { +# # "media": "(prefers-color-scheme: dark)", +# # "scheme": "slate", +# # "toggle": { +# # "icon": "material/toggle-switch", +# # "name": "Switch to light mode", +# # }, +# # }, +# # "color_primary": "green", +# # "color_accent": "green", +# # Set the repo location to get a badge with stats +# "repo_url": "https://github.com/mmcdermott/MEDS_Tabular_AutoML", +# "repo_name": "meds-tab", +# # Visible levels of the global TOC; -1 means unlimited +# "globaltoc_depth": 3, +# # If False, expand all TOC entries +# "globaltoc_collapse": True, +# # If True, show hidden TOC entries +# "globaltoc_includehidden": False, +# } + html_show_copyright = True htmlhelp_basename = "meds-tab-doc" +# -- Options for LaTeX output +# latex_engine = "xelatex" +latex_elements = { # type: ignore + # The paper size ("letterpaper" or "a4paper"). + "papersize": "letterpaper", + # The font size ("10pt", "11pt" or "12pt"). + "pointsize": "10pt", + # Additional stuff for the LaTeX preamble. + "preamble": "\n".join( + [ + r"\usepackage{svg}", + r"\DeclareUnicodeCharacter{2501}{-}", + r"\DeclareUnicodeCharacter{2503}{|}", + r"\DeclareUnicodeCharacter{2500}{-}", + r"\DeclareUnicodeCharacter{2550}{-}", + r"\DeclareUnicodeCharacter{2517}{+}", + r"\DeclareUnicodeCharacter{2518}{+}", + r"\DeclareUnicodeCharacter{2534}{+}", + r"\DeclareUnicodeCharacter{250C}{+}", + r"\DeclareUnicodeCharacter{252C}{+}", + r"\DeclareUnicodeCharacter{2510}{+}", + r"\DeclareUnicodeCharacter{2502}{|}", + r"\DeclareUnicodeCharacter{2506}{|}", + r"\DeclareUnicodeCharacter{2561}{|}", + r"\DeclareUnicodeCharacter{256A}{|}", + r"\DeclareUnicodeCharacter{2523}{|}", + r"\DeclareUnicodeCharacter{03BC}{\ensuremath{\mu}}", + r"\DeclareUnicodeCharacter{255E}{|}", + r"\DeclareUnicodeCharacter{255F}{+}", + r"\DeclareUnicodeCharacter{254E}{|}", + r"\DeclareUnicodeCharacter{257C}{-}", + r"\DeclareUnicodeCharacter{257E}{-}", + r"\DeclareUnicodeCharacter{2559}{+}", + ] + ), +} + + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, author, documentclass [howto/manual]). +latex_documents = [ + ( + "index", + "meds_tab_documentation.tex", + "MEDS-TAB Documentation", + r"Matthew McDermott \& Nassim Oufattole \& Teya Bergamaschi", + "manual", + ) +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +# latex_logo = "" + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +# latex_use_parts = False # -- Options for EPUB output epub_show_urls = "footnote" + +print(f"loading configurations for {project} {release} ...", file=sys.stderr) + + +def setup(app): + app.connect("builder-inited", ensure_pandoc_installed) diff --git a/docs/source/generated/src.MEDS_tabular_automl.configs.rst b/docs/source/generated/src.MEDS_tabular_automl.configs.rst deleted file mode 100644 index ea0b463..0000000 --- a/docs/source/generated/src.MEDS_tabular_automl.configs.rst +++ /dev/null @@ -1,30 +0,0 @@ -src.MEDS\_tabular\_automl.configs -================================= - -.. automodule:: src.MEDS_tabular_automl.configs - - - - - - - - - - - - - - - - - - - -.. rubric:: Modules - -.. 
autosummary:: - :toctree: - :recursive: - - src.MEDS_tabular_automl.configs.tabularization diff --git a/docs/source/generated/src.MEDS_tabular_automl.configs.tabularization.rst b/docs/source/generated/src.MEDS_tabular_automl.configs.tabularization.rst deleted file mode 100644 index 060282a..0000000 --- a/docs/source/generated/src.MEDS_tabular_automl.configs.tabularization.rst +++ /dev/null @@ -1,4 +0,0 @@ -src.MEDS\_tabular\_automl.configs.tabularization -================================================ - -.. automodule:: src.MEDS_tabular_automl.configs.tabularization diff --git a/docs/source/generated/src.MEDS_tabular_automl.describe_codes.rst b/docs/source/generated/src.MEDS_tabular_automl.describe_codes.rst deleted file mode 100644 index cbf802b..0000000 --- a/docs/source/generated/src.MEDS_tabular_automl.describe_codes.rst +++ /dev/null @@ -1,23 +0,0 @@ -src.MEDS\_tabular\_automl.describe\_codes -========================================= - -.. automodule:: src.MEDS_tabular_automl.describe_codes - - - - - - - - .. rubric:: Functions - - .. autosummary:: - - clear_code_aggregation_suffix - compute_feature_frequencies - convert_to_df - convert_to_freq_dict - filter_parquet - filter_to_codes - get_feature_columns - get_feature_freqs diff --git a/docs/source/generated/src.MEDS_tabular_automl.file_name.rst b/docs/source/generated/src.MEDS_tabular_automl.file_name.rst deleted file mode 100644 index 3b75288..0000000 --- a/docs/source/generated/src.MEDS_tabular_automl.file_name.rst +++ /dev/null @@ -1,18 +0,0 @@ -src.MEDS\_tabular\_automl.file\_name -==================================== - -.. automodule:: src.MEDS_tabular_automl.file_name - - - - - - - - .. rubric:: Functions - - .. autosummary:: - - get_model_files - get_task_specific_path - list_subdir_files diff --git a/docs/source/generated/src.MEDS_tabular_automl.generate_static_features.rst b/docs/source/generated/src.MEDS_tabular_automl.generate_static_features.rst deleted file mode 100644 index 7656f7f..0000000 --- a/docs/source/generated/src.MEDS_tabular_automl.generate_static_features.rst +++ /dev/null @@ -1,19 +0,0 @@ -src.MEDS\_tabular\_automl.generate\_static\_features -==================================================== - -.. automodule:: src.MEDS_tabular_automl.generate_static_features - - - - - - - - .. rubric:: Functions - - .. autosummary:: - - convert_to_matrix - get_flat_static_rep - get_sparse_static_rep - summarize_static_measurements diff --git a/docs/source/generated/src.MEDS_tabular_automl.generate_summarized_reps.rst b/docs/source/generated/src.MEDS_tabular_automl.generate_summarized_reps.rst deleted file mode 100644 index b9f44d7..0000000 --- a/docs/source/generated/src.MEDS_tabular_automl.generate_summarized_reps.rst +++ /dev/null @@ -1,20 +0,0 @@ -src.MEDS\_tabular\_automl.generate\_summarized\_reps -==================================================== - -.. automodule:: src.MEDS_tabular_automl.generate_summarized_reps - - - - - - - - .. rubric:: Functions - - .. autosummary:: - - aggregate_matrix - compute_agg - generate_summary - get_rolling_window_indicies - sparse_aggregate diff --git a/docs/source/generated/src.MEDS_tabular_automl.generate_ts_features.rst b/docs/source/generated/src.MEDS_tabular_automl.generate_ts_features.rst deleted file mode 100644 index e4fdd21..0000000 --- a/docs/source/generated/src.MEDS_tabular_automl.generate_ts_features.rst +++ /dev/null @@ -1,20 +0,0 @@ -src.MEDS\_tabular\_automl.generate\_ts\_features -================================================ - -.. 
automodule:: src.MEDS_tabular_automl.generate_ts_features - - - - - - - - .. rubric:: Functions - - .. autosummary:: - - feature_name_to_code - get_flat_ts_rep - get_long_code_df - get_long_value_df - summarize_dynamic_measurements diff --git a/docs/source/generated/src.MEDS_tabular_automl.mapper.rst b/docs/source/generated/src.MEDS_tabular_automl.mapper.rst deleted file mode 100644 index a0a05ea..0000000 --- a/docs/source/generated/src.MEDS_tabular_automl.mapper.rst +++ /dev/null @@ -1,18 +0,0 @@ -src.MEDS\_tabular\_automl.mapper -================================ - -.. automodule:: src.MEDS_tabular_automl.mapper - - - - - - - - .. rubric:: Functions - - .. autosummary:: - - get_earliest_lock - register_lock - wrap diff --git a/docs/source/generated/src.MEDS_tabular_automl.rst b/docs/source/generated/src.MEDS_tabular_automl.rst deleted file mode 100644 index 2fd4a60..0000000 --- a/docs/source/generated/src.MEDS_tabular_automl.rst +++ /dev/null @@ -1,38 +0,0 @@ -src.MEDS\_tabular\_automl -========================= - -.. automodule:: src.MEDS_tabular_automl - - - - - - - - - - - - - - - - - - - -.. rubric:: Modules - -.. autosummary:: - :toctree: - :recursive: - - src.MEDS_tabular_automl.configs - src.MEDS_tabular_automl.describe_codes - src.MEDS_tabular_automl.file_name - src.MEDS_tabular_automl.generate_static_features - src.MEDS_tabular_automl.generate_summarized_reps - src.MEDS_tabular_automl.generate_ts_features - src.MEDS_tabular_automl.mapper - src.MEDS_tabular_automl.scripts - src.MEDS_tabular_automl.utils diff --git a/docs/source/generated/src.MEDS_tabular_automl.scripts.cache_task.rst b/docs/source/generated/src.MEDS_tabular_automl.scripts.cache_task.rst deleted file mode 100644 index 76d5c8c..0000000 --- a/docs/source/generated/src.MEDS_tabular_automl.scripts.cache_task.rst +++ /dev/null @@ -1,17 +0,0 @@ -src.MEDS\_tabular\_automl.scripts.cache\_task -============================================= - -.. automodule:: src.MEDS_tabular_automl.scripts.cache_task - - - - - - - - .. rubric:: Functions - - .. autosummary:: - - generate_row_cached_matrix - main diff --git a/docs/source/generated/src.MEDS_tabular_automl.scripts.describe_codes.rst b/docs/source/generated/src.MEDS_tabular_automl.scripts.describe_codes.rst deleted file mode 100644 index 7b604e6..0000000 --- a/docs/source/generated/src.MEDS_tabular_automl.scripts.describe_codes.rst +++ /dev/null @@ -1,16 +0,0 @@ -src.MEDS\_tabular\_automl.scripts.describe\_codes -================================================= - -.. automodule:: src.MEDS_tabular_automl.scripts.describe_codes - - - - - - - - .. rubric:: Functions - - .. autosummary:: - - main diff --git a/docs/source/generated/src.MEDS_tabular_automl.scripts.launch_xgboost.rst b/docs/source/generated/src.MEDS_tabular_automl.scripts.launch_xgboost.rst deleted file mode 100644 index 61afb61..0000000 --- a/docs/source/generated/src.MEDS_tabular_automl.scripts.launch_xgboost.rst +++ /dev/null @@ -1,27 +0,0 @@ -src.MEDS\_tabular\_automl.scripts.launch\_xgboost -================================================= - -.. automodule:: src.MEDS_tabular_automl.scripts.launch_xgboost - - - - - - - - .. rubric:: Functions - - .. autosummary:: - - main - - - - - - .. rubric:: Classes - - .. 
autosummary:: - - Iterator - XGBoostModel diff --git a/docs/source/generated/src.MEDS_tabular_automl.scripts.rst b/docs/source/generated/src.MEDS_tabular_automl.scripts.rst deleted file mode 100644 index 15ca299..0000000 --- a/docs/source/generated/src.MEDS_tabular_automl.scripts.rst +++ /dev/null @@ -1,34 +0,0 @@ -src.MEDS\_tabular\_automl.scripts -================================= - -.. automodule:: src.MEDS_tabular_automl.scripts - - - - - - - - - - - - - - - - - - - -.. rubric:: Modules - -.. autosummary:: - :toctree: - :recursive: - - src.MEDS_tabular_automl.scripts.cache_task - src.MEDS_tabular_automl.scripts.describe_codes - src.MEDS_tabular_automl.scripts.launch_xgboost - src.MEDS_tabular_automl.scripts.tabularize_static - src.MEDS_tabular_automl.scripts.tabularize_time_series diff --git a/docs/source/generated/src.MEDS_tabular_automl.scripts.tabularize_static.rst b/docs/source/generated/src.MEDS_tabular_automl.scripts.tabularize_static.rst deleted file mode 100644 index 61852e0..0000000 --- a/docs/source/generated/src.MEDS_tabular_automl.scripts.tabularize_static.rst +++ /dev/null @@ -1,16 +0,0 @@ -src.MEDS\_tabular\_automl.scripts.tabularize\_static -==================================================== - -.. automodule:: src.MEDS_tabular_automl.scripts.tabularize_static - - - - - - - - .. rubric:: Functions - - .. autosummary:: - - main diff --git a/docs/source/generated/src.MEDS_tabular_automl.scripts.tabularize_time_series.rst b/docs/source/generated/src.MEDS_tabular_automl.scripts.tabularize_time_series.rst deleted file mode 100644 index 066a968..0000000 --- a/docs/source/generated/src.MEDS_tabular_automl.scripts.tabularize_time_series.rst +++ /dev/null @@ -1,16 +0,0 @@ -src.MEDS\_tabular\_automl.scripts.tabularize\_time\_series -========================================================== - -.. automodule:: src.MEDS_tabular_automl.scripts.tabularize_time_series - - - - - - - - .. rubric:: Functions - - .. autosummary:: - - main diff --git a/docs/source/generated/src.MEDS_tabular_automl.utils.rst b/docs/source/generated/src.MEDS_tabular_automl.utils.rst deleted file mode 100644 index ba45d9c..0000000 --- a/docs/source/generated/src.MEDS_tabular_automl.utils.rst +++ /dev/null @@ -1,37 +0,0 @@ -src.MEDS\_tabular\_automl.utils -=============================== - -.. automodule:: src.MEDS_tabular_automl.utils - - - - - - - - .. rubric:: Functions - - .. autosummary:: - - add_static_missing_cols - array_to_sparse_matrix - get_events_df - get_feature_indices - get_feature_names - get_flat_rep_feature_cols - get_min_dtype - get_prediction_ts_cols - get_shard_prefix - get_static_col_dtype - get_static_feature_cols - get_ts_feature_cols - get_unique_time_events_df - hydra_loguru_init - load_matrix - load_meds_data - load_tqdm - parse_static_feature_column - sparse_matrix_to_array - store_config_yaml - store_matrix - write_df diff --git a/docs/source/generated/src.rst b/docs/source/generated/src.rst deleted file mode 100644 index b9f0096..0000000 --- a/docs/source/generated/src.rst +++ /dev/null @@ -1,30 +0,0 @@ -src -=== - -.. automodule:: src - - - - - - - - - - - - - - - - - - - -.. rubric:: Modules - -.. 
autosummary::
-   :toctree:
-   :recursive:
-
-   src.MEDS_tabular_automl
diff --git a/docs/source/implementation.md b/docs/source/implementation.md
new file mode 100644
index 0000000..aabe6a2
--- /dev/null
+++ b/docs/source/implementation.md
@@ -0,0 +1,105 @@
+# The MEDS-Tab Architecture
+
+In this section, we describe the MEDS-Tab architecture, specifically some of the pipeline choices we made to reduce memory usage and increase speed during the tabularization and XGBoost tuning processes.
+
+We break our method into four discrete parts:
+
+1. Describe codes (compute feature frequencies)
+2. Tabularization of time-series data
+3. Efficient data caching for task-specific rows
+4. XGBoost training
+
+## 1. Describe Codes (compute feature frequencies)
+
+This initial stage processes a pre-sharded dataset. We expect a structure as follows, where each shard contains a subset of the patients:
+
+```text
+/PATH/TO/MEDS/DATA
+│
+└─── <split A>
+│   │   <shard 0>.parquet
+│   │   <shard 1>.parquet
+│   │   ...
+│
+└─── <split B>
+│   │   <shard 0>.parquet
+│   │   <shard 1>.parquet
+│   │   ...
+│
+...
+```
+
+We then compute and store feature frequencies, which are crucial for determining which features are relevant for further analysis.
+
+**Detailed Workflow:**
+
+- **Data Loading and Sharding**: We iterate through the shards, computing feature frequencies for each shard.
+- **Frequency Aggregation**: After computing frequencies across shards, we aggregate them to get a final count of each feature across the entire training dataset, which allows us to filter out infrequent features in the tabularization stage or when tuning XGBoost.
+
+## 2. Tabularization of Time-Series Data
+
+### Overview
+
+The tabularization stage of our pipeline is exposed via two CLI commands:
+
+- `meds-tab-tabularize-static` for tabularizing static data, and
+- `meds-tab-tabularize-time-series` for tabularizing time-series data.
+
+Static data is relatively small in medical datasets, so we use a dense pivot operation, convert the result to a sparse matrix, and then duplicate rows so that the static data lines up with the time-series rows generated in the next step. Static data is currently processed serially.
+
+The time-series tabularization script transforms the raw dataset into a structured, feature-rich dataset through a series of data processing steps. This transformation (as depicted in the figure below) involves converting the raw time series from a Polars dataframe into a sparse matrix format, aggregating events that occur on the same date for the same patient, and then applying rolling window aggregations to extract temporal features.
+
+![Time Series Tabularization Method](../assets/pivot.png)
+
+### High-Level Tabularization Algorithm
+
+1. **Data Loading and Categorization**:
+
+   - The script iterates through shards of patients; shards can be processed in parallel using Hydra's joblib launcher to spawn multiple processes.
+
+2. **Sparse Matrix Conversion**:
+
+   - Data from the Polars dataframe is converted into a sparse matrix format, where each row represents a unique event (patient x timestamp) and each column corresponds to a MEDS code for the patient.
+
+3. **Rolling Window Aggregation**:
+
+   - For each aggregation method (sum, count, min, max, etc.), events that occur on the same date for the same patient are first aggregated. This reduces the amount of data we have to perform rolling windows over.
+   - We then aggregate features over the specified rolling window sizes.
+
+4. **Output Storage**:
+
+   - The sparse array is converted to coordinate list (COO) format and stored as a `.npz` file on disk.
+   - The file paths look as follows:
+
+```text
+/PATH/TO/MEDS/TABULAR_DATA
+│
+└─── <split>
+    ├─── <shard>
+    │   ├───code
+    │   │   └───count.npz
+    │   └───value
+    │       └───sum.npz
+    ...
+```
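+As a rough, self-contained sketch of the rolling-window step (illustrative only -- the real pipeline is sharded and stays sparse end-to-end, and the function below is not part of the package's API), consider summing each per-day feature row together with the rows in its trailing window:
+
+```python
+import numpy as np
+import scipy.sparse as sp
+
+
+def trailing_window_sum(daily: sp.csr_matrix, days: np.ndarray, window: int) -> sp.csr_matrix:
+    """Sum each per-day feature row with all rows from the preceding `window` days.
+
+    `daily` has one row per (patient, day) with same-day events already aggregated;
+    `days` gives each row's day as a sorted integer ordinal. Patient boundaries are
+    ignored here for brevity.
+    """
+    out = []
+    for i in range(daily.shape[0]):
+        # First row whose day falls inside the trailing window ending at days[i].
+        start = np.searchsorted(days, days[i] - window + 1, side="left")
+        out.append(daily[start : i + 1].sum(axis=0))  # densifies one row at a time
+    return sp.csr_matrix(np.vstack(out))
+```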
+## 3. Efficient Data Caching for Task-Specific Rows
+
+Now that we have generated tabular features for all the events in our dataset, we can cache the subsets relevant to each task we wish to train a supervised model on. This step is critical for efficiently training machine learning models on task-specific data without having to load the entire dataset.
+
+**Detailed Workflow:**
+
+- **Row Selection Based on Tasks**: Only the data rows relevant to a specific task are selected and cached. This reduces the memory footprint and speeds up the training process.
+- **Use of Sparse Matrices for Efficient Storage**: Sparse matrices are again employed here to store the selected data efficiently, ensuring that only non-zero data points are kept in memory, optimizing both storage and retrieval times.
+
+The file structure for the cached data mirrors that of the tabular data and also consists of `.npz` files; users must specify the directory that stores labels. Labels follow the same shard file structure as the input MEDS data from step (1), and the label parquets need `patient_id`, `timestamp`, and `label` columns.
+
+## 4. XGBoost Training
+
+The final stage uses the processed and cached data to train an XGBoost model. This stage is optimized to efficiently handle the sparse data structures produced in earlier stages.
+
+**Detailed Workflow:**
+
+- **Iterator for Data Loading**: Custom iterators are designed to load sparse matrices efficiently into the XGBoost training process, which can handle sparse inputs natively, thus maintaining high computational efficiency.
+- **Training and Validation**: The model is trained using the tabular data, with evaluation steps that include early stopping to prevent overfitting and tuning of hyperparameters based on validation performance.
+- **Hyperparameter Tuning**: We use [Optuna](https://optuna.org/) to tune over XGBoost model parameters, aggregations, window sizes, and the minimum code inclusion frequency.
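+Schematically, the data-loading iterator follows XGBoost's external-memory `DataIter` protocol. The sketch below is illustrative -- the class name, paths, and label format are assumptions, not the exact classes in `launch_xgboost.py`:
+
+```python
+import numpy as np
+import scipy.sparse as sp
+import xgboost as xgb
+
+
+class ShardIterator(xgb.DataIter):
+    """Feed cached sparse shards to XGBoost one batch at a time."""
+
+    def __init__(self, shard_paths: list[str], label_paths: list[str]):
+        self._batches = list(zip(shard_paths, label_paths))
+        self._i = 0
+        super().__init__(cache_prefix="xgb_cache")
+
+    def next(self, input_data) -> int:
+        if self._i == len(self._batches):
+            return 0  # signal the end of one pass over the data
+        X_path, y_path = self._batches[self._i]
+        input_data(data=sp.load_npz(X_path), label=np.load(y_path))
+        self._i += 1
+        return 1
+
+    def reset(self) -> None:
+        self._i = 0
+
+
+# dtrain = xgb.DMatrix(ShardIterator(...))  # XGBoost then consumes the batches lazily
+```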
diff --git a/docs/source/implementation.rst b/docs/source/implementation.rst
deleted file mode 100644
index d03e1a6..0000000
--- a/docs/source/implementation.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-How does MEDS-Tab Work?
-=======================
-
-.. include:: ../../README.md
-   :parser: markdown
-   :start-after: How does MEDS-Tab Work?
-   :end-before: Computational Performance vs. Existing Pipelines
diff --git a/docs/source/index.md b/docs/source/index.md
new file mode 100644
index 0000000..8f75fb2
--- /dev/null
+++ b/docs/source/index.md
@@ -0,0 +1,39 @@
+# Welcome!
+
+MEDS-Tab is a library designed for automated tabularization, data preparation with aggregation, and time windowing. Check out the overview below to see how MEDS-Tab could be useful in your workflows!
+
+```{toctree}
+---
+glob:
+maxdepth: 2
+---
+Overview <overview>
+Pipeline <implementation>
+Memory/CPU Usage <profiling>
+Performance <prediction>
+API <api>
+```
+
+______________________________________________________________________
+
+## Why MEDS-Tab?
+
+MEDS-Tab is a comprehensive framework designed to streamline the handling, modeling, and analysis of complex medical time-series data. By leveraging automated processes, MEDS-Tab significantly reduces the computation required to generate high-quality baseline models for diverse supervised learning tasks.
+
+- Cost Efficiency: MEDS-Tab is dramatically more cost-effective than existing solutions.
+- Strong Performance: MEDS-Tab provides robustness and high performance across various datasets compared with other frameworks.
+
+### I. Transform to MEDS
+
+MEDS-Tab leverages the recently developed, minimal, easy-to-use Medical Event Data Standard (MEDS) schema to standardize structured EHR data to a consistent schema from which baselines can be reliably produced across arbitrary tasks and settings. In order to use MEDS-Tab, you will first need to transform your raw EHR data into a MEDS format, which can be done using the following libraries:
+
+- [MEDS Polars](https://github.com/mmcdermott/MEDS_polars_functions) for a set of functions and scripts for extraction to and transformation/pre-processing of MEDS-formatted data.
+- [MEDS ETL](https://github.com/Medical-Event-Data-Standard/meds_etl) for a collection of ETLs from common data formats to MEDS. The package currently supports MIMIC-IV, OMOP v5, and MEDS FLAT (a flat version of MEDS).
+
+### II. Run MEDS-Tab
+
+- Run the MEDS-Tab Command-Line Interface tool (`MEDS-Tab-cli`) to extract cohorts based on your task - check out the [Usage Guide](https://meds-tab--36.org.readthedocs.build/en/36/overview.html#core-cli-scripts-overview)!
+
+- Painless Reproducibility: Use [MEDS-Tab](https://github.com/mmcdermott/MEDS_TAB_MIMIC_IV/tree/main/tasks) to obtain comparable, reproducible, and well-tuned XGBoost results tailored to your dataset-specific feature space!
+
+By following these steps, you can seamlessly transform your dataset, define the necessary criteria, and leverage powerful machine learning tools within the MEDS-Tab ecosystem. This approach not only simplifies the process but also ensures high-quality, reproducible results for your machine learning tasks in health projects. Performing Steps I and II on a new dataset in a reasonable raw format should reliably take no more than a week of full-time human effort!
diff --git a/docs/source/index.rst b/docs/source/index.rst
deleted file mode 100644
index 15737d3..0000000
--- a/docs/source/index.rst
+++ /dev/null
@@ -1,19 +0,0 @@
-.. MEDS-TAB documentation master file, created by
-   sphinx-quickstart on Mon Jun 3 20:41:52 2024.
-   You can adapt this file completely to your liking, but it should at least
-   contain the root `toctree` directive.
-
-Welcome to MEDS-Tab's documentation!
-====================================
-
-.. toctree::
-   :maxdepth: 2
-   :caption: Contents:
-
-   overview
-   installation
-   usage
-   implementation
-   computational-performance
-   xgboost-performance
-   api
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
deleted file mode 100644
index fe44a54..0000000
--- a/docs/source/installation.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Installation
-============
-
-.. include:: ../../README.md
-   :parser: markdown
-   :start-after: Installation
-   :end-before: Usage
diff --git a/docs/source/overview.md b/docs/source/overview.md
new file mode 100644
index 0000000..5132c11
--- /dev/null
+++ b/docs/source/overview.md
@@ -0,0 +1,161 @@
+# Usage
+
+This repository consists of two key pieces:
+
+1. Construction and efficient loading of tabular (flat, non-longitudinal) summary features describing patient records in MEDS over arbitrary time windows (e.g. 1 year, 6 months, etc.), which go backwards in time from a given index date.
+2. Running a basic XGBoost AutoML pipeline over these tabular features to predict arbitrary binary classification or regression downstream tasks defined over these datasets. The "AutoML" part of this is not particularly advanced -- what is more advanced is the efficient construction, storage, and loading of tabular features for the candidate AutoML models, enabling a far more extensive search over a much larger total number of features than prior systems.
+
+## Quick Start
+
+To use MEDS-Tab, install it with the commands below:
+
+**Pip Install**
+
+```console
+pip install meds-tab
+```
+
+**Local Install**
+
+```console
+# clone the git repo
+pip install .
+```
+
+## Scripts and Examples
+
+For an end-to-end example over MIMIC-IV, see the [MIMIC-IV companion repository](https://github.com/mmcdermott/MEDS_TAB_MIMIC_IV).
+For an end-to-end example over Philips eICU, see the [eICU companion repository](https://github.com/mmcdermott/MEDS_TAB_EICU).
+
+See [`/tests/test_integration.py`](https://github.com/mmcdermott/MEDS_Tabular_AutoML/blob/main/tests/test_integration.py) for a local example of the end-to-end pipeline being run on synthetic data. This script is a functional test that is also run with `pytest` to verify the correctness of the algorithm.
+
+## Core CLI Scripts Overview
+
+1. **`meds-tab-describe`**: This command processes MEDS data shards to compute the frequencies of different code types. It differentiates codes into the following categories:
+
+   - time-series codes (codes with timestamps)
+   - time-series numerical values (codes with timestamps and numerical values)
+   - static codes (codes without timestamps)
+   - static numerical codes (codes without timestamps but with numerical values)
+
+   This script also caches feature names and frequencies in a `code_metadata.parquet` file within the directory given by the `MEDS_cohort_dir` Hydra-style command-line argument.
+
+2. **`meds-tab-tabularize-static`**: Filters and processes the dataset based on the frequency of codes, generating a tabular vector for each patient at each timestamp in the shards. Each row corresponds to a unique `patient_id` and `timestamp` combination; rows are thus duplicated across multiple timestamps for the same patient.
+
+   **Example: Tabularizing static data** with a minimum code frequency of 10, window sizes of `[1d, 30d, 365d, full]`, and value aggregation methods of `[static/present, static/first, code/count, value/count, value/sum, value/sum_sqd, value/min, value/max]`:
+
+   ```console
+   meds-tab-tabularize-static MEDS_cohort_dir="path_to_data" \
+       tabularization.min_code_inclusion_frequency=10 \
+       tabularization.window_sizes=[1d,30d,365d,full] \
+       do_overwrite=False \
+       tabularization.aggs=[static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]
+   ```
+
+   - For the exhaustive list of supported value aggregations, see [`/src/MEDS_tabular_automl/utils.py`](https://github.com/mmcdermott/MEDS_Tabular_AutoML/blob/main/src/MEDS_tabular_automl/utils.py#L24)
+
+3. **`meds-tab-tabularize-time-series`**: Iterates through combinations of a shard, `window_size`, and `aggregation` to generate feature vectors that aggregate patient data for each unique `patient_id` x `timestamp`. This stage (and the previous stage) uses sparse matrix formats to efficiently handle the computational and storage demands of rolling window calculations on large datasets. We support parallelization through Hydra's [`--multirun`](https://hydra.cc/docs/intro/#multirun) flag and the [`joblib` launcher](https://hydra.cc/docs/plugins/joblib_launcher/#internaldocs-banner).
+
+   **Example: Aggregate time-series data** on features across different `window_sizes`:
+
+   ```console
+   meds-tab-tabularize-time-series --multirun \
+       worker="range(0,$N_PARALLEL_WORKERS)" \
+       hydra/launcher=joblib \
+       MEDS_cohort_dir="path_to_data" \
+       tabularization.min_code_inclusion_frequency=10 \
+       do_overwrite=False \
+       tabularization.window_sizes=[1d,30d,365d,full] \
+       tabularization.aggs=[static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]
+   ```
+
+4. **`meds-tab-cache-task`**: Aligns task-specific labels with the nearest prior event in the tabularized data (see the alignment sketch after this list). It requires a labeled dataset directory with three columns (`patient_id`, `timestamp`, `label`) structured similarly to the `MEDS_cohort_dir`.
+
+   **Example: Align tabularized data** for a specific task `$TASK`, with labels pulled from [ACES](https://github.com/justin13601/ACES):
+
+   ```console
+   meds-tab-cache-task MEDS_cohort_dir="path_to_data" \
+       task_name=$TASK \
+       tabularization.min_code_inclusion_frequency=10 \
+       do_overwrite=False \
+       tabularization.window_sizes=[1d,30d,365d,full] \
+       tabularization.aggs=[static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]
+   ```
+
+5. **`meds-tab-xgboost`**: Trains an XGBoost model using user-specified parameters. Permutations of `window_sizes` and `aggs` can be generated with the `generate-permutations` command (see the next section).
+
+   ```console
+   meds-tab-xgboost --multirun \
+       MEDS_cohort_dir="path_to_data" \
+       task_name=$TASK \
+       output_dir="output_directory" \
+       tabularization.min_code_inclusion_frequency=10 \
+       tabularization.window_sizes=$(generate-permutations [1d,30d,365d,full]) \
+       do_overwrite=False \
+       tabularization.aggs=$(generate-permutations [static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max])
+   ```
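+The label-to-event alignment performed by `meds-tab-cache-task` (step 4 above) can be pictured with a minimal sketch; `event_times` and `label_times` are illustrative names, and the real implementation operates on sharded sparse matrices rather than plain arrays:
+
+```python
+import numpy as np
+
+
+def nearest_prior_event(event_times: np.ndarray, label_times: np.ndarray) -> np.ndarray:
+    """For each label timestamp, return the index of the last event at or before it.
+
+    Both arrays must be sorted and belong to a single patient; -1 marks labels
+    with no prior event.
+    """
+    return np.searchsorted(event_times, label_times, side="right") - 1
+```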
+
+## Additional CLI Scripts
+
+1. **`generate-permutations`**: Generates and prints a sorted list of all permutations from a comma-separated input. This is provided for the convenience of sweeping over all possible combinations of window sizes and aggregations.
+
+   For example, you can directly call **`generate-permutations`** in the command line, which prints the permutations of its input:
+
+   ```console
+   generate-permutations [2,3,4]
+   [2], [2, 3], [2, 3, 4], [2, 4], [3], [3, 4], [4]
+   ```
+
+   This could be used in the command line in concert with other calls. For example, the following call:
+
+   ```console
+   meds-tab-xgboost --multirun tabularization.window_sizes=$(generate-permutations [1d,2d,7d,full])
+   ```
+
+   would resolve to:
+
+   ```console
+   meds-tab-xgboost --multirun tabularization.window_sizes=[1d],[1d,2d],[1d,2d,7d],[1d,2d,7d,full],[1d,2d,full],[1d,7d],[1d,7d,full],[1d,full],[2d],[2d,7d],[2d,7d,full],[2d,full],[7d],[7d,full],[full]
+   ```
+
+   which can then be correctly interpreted by Hydra's multirun logic.
+
+## Roadmap
+
+MEDS-Tab has several key limitations, which we plan to address in future changes. These limitations are tracked by the following GitHub issues.
+
+### Improvements to the core tabularization
+
+1. Further memory and runtime improvements are possible: [#16](https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues/16)
+2. We should support additional window sizes and types: [#31](https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues/31)
+3. We should support additional aggregation functions: [#32](https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues/32)
+
+### Improvements to the modeling pipeline
+
+1. We should likely decorrelate the default aggregations and/or window sizes we use prior to passing them into the models as features: [#27](https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues/27)
+2. We need to do a detailed parameter study over the hyperparameter sweep options to find good defaults for these kinds of problems and models: [#33](https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues/33)
+3. We should support a more extensive set of pipeline operations and model architectures: [#37](https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues/37)
+
+### Technical debt / code improvements
+
+1. The computation and use of the code metadata dataframe, containing frequencies of codes, should be offloaded to core MEDS functionality, with the remaining code in this repository cleaned up.
+   - [#28](https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues/28)
+   - [#14](https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues/14)
+2. We should add more doctests and push test coverage up to 100%:
+   - [#29](https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues/29)
+   - [#30](https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues/30)
+3. We need to ensure full and seamless compatibility with the ACES CLI tool, rather than relying on the python API and manual adjustments:
+   [#34](https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues/34)
+
+## What do you mean "tabular pipelines"? Isn't _all_ structured EHR data already tabular?
+
+This is a common misconception. _Tabular_ data refers to data that can be organized in a consistent, logical
+set of rows/columns such that the entirety of a "sample" or "instance" for modeling or analysis is contained
+in a single row, and the set of columns possibly observed (there can be missingness) is consistent across all
+rows. Structured EHR data does not satisfy this definition, as we will have different numbers of observations
+of medical codes and values at different timestamps for different patients, so it cannot simultaneously
+satisfy the (1) "single row per instance", (2) "consistent set of columns", and (3) "logical" requirements.
+Thus, in this pipeline, when we say we will produce a "tabular" view of MEDS data, we mean a dataset that can
+realize these constraints, which will explicitly involve summarizing the patient data over various historical
+or future windows in time to produce a single row per patient with a consistent, logical set of columns
+(though there may still be missingness).
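+
+To make this concrete, here is a toy illustration (synthetic data; not MEDS-Tab's actual API or feature naming) of how a variable number of longitudinal events collapses into one row per patient with a consistent set of columns:
+
+```python
+import polars as pl
+
+# Raw, longitudinal events: patients have *different* numbers of observations.
+events = pl.DataFrame({
+    "patient_id": [1, 1, 1, 2],
+    "code": ["HR", "HR", "DX_FLU", "HR"],
+    "value": [88.0, 92.0, None, 75.0],
+})
+
+# A "tabular" view over some window: one row per patient, fixed columns.
+tabular = events.group_by("patient_id").agg(
+    (pl.col("code") == "HR").sum().alias("HR/count"),
+    pl.col("value").filter(pl.col("code") == "HR").sum().alias("HR/value_sum"),
+    (pl.col("code") == "DX_FLU").sum().alias("DX_FLU/count"),
+)
+```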
diff --git a/docs/source/overview.rst b/docs/source/overview.rst
deleted file mode 100644
index c19436c..0000000
--- a/docs/source/overview.rst
+++ /dev/null
@@ -1,6 +0,0 @@
-Overview
-========
-
-.. include:: ../../README.md
-   :parser: markdown
-   :end-before: Installation
diff --git a/docs/source/prediction.md b/docs/source/prediction.md
new file mode 100644
index 0000000..35131d8
--- /dev/null
+++ b/docs/source/prediction.md
@@ -0,0 +1,206 @@
+# Prediction Performance
+
+## XGBoost Model Performance on MIMIC-IV Tasks
+
+Evaluating our tabularization approach for baseline models involved training XGBoost across a spectrum of binary clinical prediction tasks, using data from the MIMIC-IV database. These tasks encompassed diverse outcomes such as mortality predictions over different intervals, readmission predictions, and lengths of stay (LOS) in both ICU and hospital settings.
+
+Each task is characterized by its specific label and prediction time. For instance, predicting "30-day readmission" involves assessing whether a patient returns to the hospital within 30 days, with predictions made at the time of discharge. This allows input features to be derived from the entire duration of the patient's admission. In contrast, tasks like "In ICU Mortality" focus on predicting the occurrence of death using only data from the first 24 or 48 hours of ICU admission. Specifically, we use the term "Index Timestamp" to mean the timestamp such that no event included as input occurs later than that point.
+
+We optimize predictive accuracy and model performance by using varied window sizes and aggregations of patient data. This approach allows us to effectively capture and leverage the temporal dynamics and clinical nuances inherent in each prediction task.
+
+### 1. XGBoost Time and Memory Profiling on MIMIC-IV
+
+A single XGBoost run was completed to profile time and memory usage. This was done for each `$TASK` using the following command:
+
+```console
+meds-tab-xgboost \
+    MEDS_cohort_dir="path_to_data" \
+    task_name=$TASK \
+    output_dir="output_directory" \
+    do_overwrite=False
+```
+
+This uses the default minimum code inclusion frequency, window sizes, and aggregations from `launch_xgboost.yaml`:
+
+```yaml
+allowed_codes: # allows all codes that meet min code inclusion frequency
+min_code_inclusion_frequency: 10
+window_sizes:
+  - 1d
+  - 7d
+  - 30d
+  - 365d
+  - full
+aggs:
+  - static/present
+  - static/first
+  - code/count
+  - value/count
+  - value/sum
+  - value/sum_sqd
+  - value/min
+  - value/max
+```
+
+Since this includes every window size and aggregation, it is the most expensive configuration to run. The runtimes and memory usage are reported below.
+
+#### 1.1 XGBoost Runtimes and Memory Usage on MIMIC-IV Tasks
+
+| Task | Index Timestamp | Real Time | User Time | Sys Time | Avg Memory (MiB) | Peak Memory (MiB) |
+| ------------------------------- | ----------------- | --------- | --------- | -------- | ---------------- | ----------------- |
+| Post-discharge 30 day Mortality | Discharge | 2m59s | 3m38s | 0m38s | 9,037 | 11,955 |
+| Post-discharge 1 year Mortality | Discharge | 5m16s | 6m10s | 0m59s | 10,804 | 12,330 |
+| 30 day Readmission | Discharge | 2m30s | 3m3s | 0m39s | 13,199 | 18,677 |
+| In ICU Mortality | Admission + 24 hr | 0m38s | 1m3s | 0m13s | 1,712 | 2,986 |
+| In ICU Mortality | Admission + 48 hr | 0m34s | 1m1s | 0m13s | 1,613 | 2,770 |
+| In Hospital Mortality | Admission + 24 hr | 2m8s | 2m41s | 0m32s | 9,072 | 12,056 |
+| In Hospital Mortality | Admission + 48 hr | 1m54s | 2m25s | 0m29s | 8,858 | 12,371 |
+| LOS in ICU > 3 days | Admission + 24 hr | 2m3s | 2m37s | 0m28s | 4,650 | 5,715 |
+| LOS in ICU > 3 days | Admission + 48 hr | 1m44s | 2m18s | 0m24s | 4,453 | 5,577 |
+| LOS in Hospital > 3 days | Admission + 24 hr | 6m5s | 7m5s | 1m4s | 11,012 | 12,223 |
+| LOS in Hospital > 3 days | Admission + 48 hr | 6m10s | 7m12s | 1m4s | 10,703 | 11,830 |
+
+#### 1.2 MIMIC-IV Task Specific Training Cohort Size
+
+To better understand the runtimes, we also report the task-specific cohort sizes.
+
+| Task | Index Timestamp | Number of Patients | Number of Events |
+| ------------------------------- | ----------------- | ------------------ | ---------------- |
+| Post-discharge 30 day Mortality | Discharge | 149,014 | 356,398 |
+| Post-discharge 1 year Mortality | Discharge | 149,014 | 356,398 |
+| 30 day Readmission | Discharge | 17,418 | 377,785 |
+| In ICU Mortality | Admission + 24 hr | 7,839 | 22,811 |
+| In ICU Mortality | Admission + 48 hr | 6,750 | 20,802 |
+| In Hospital Mortality | Admission + 24 hr | 51,340 | 338,614 |
+| In Hospital Mortality | Admission + 48 hr | 47,231 | 348,289 |
+| LOS in ICU > 3 days | Admission + 24 hr | 42,809 | 61,342 |
+| LOS in ICU > 3 days | Admission + 48 hr | 42,805 | 61,327 |
+| LOS in Hospital > 3 days | Admission + 24 hr | 152,126 | 360,208 |
+| LOS in Hospital > 3 days | Admission + 48 hr | 152,120 | 359,020 |
+
+### 2. MIMIC-IV Sweep
+
+The XGBoost sweep was run using the following command for each `$TASK`:
+
+```console
+meds-tab-xgboost --multirun \
+    MEDS_cohort_dir="path_to_data" \
+    task_name=$TASK \
+    output_dir="output_directory" \
+    tabularization.window_sizes=$(generate-permutations [1d,30d,365d,full]) \
+    do_overwrite=False \
+    tabularization.aggs=$(generate-permutations [static/present,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max])
+```
+
+The model parameters were set to:
+
+```yaml
+model:
+  booster: gbtree
+  device: cpu
+  nthread: 1
+  tree_method: hist
+  objective: binary:logistic
+```
+
+The hydra sweeper swept over the parameters:
+
+```yaml
+params:
+  +model_params.model.eta: tag(log, interval(0.001, 1))
+  +model_params.model.lambda: tag(log, interval(0.001, 1))
+  +model_params.model.alpha: tag(log, interval(0.001, 1))
+  +model_params.model.subsample: interval(0.5, 1)
+  +model_params.model.min_child_weight: interval(1e-2, 100)
+  +model_params.model.max_depth: range(2, 16)
+  model_params.num_boost_round: range(100, 1000)
+  model_params.early_stopping_rounds: range(1, 10)
+  tabularization.min_code_inclusion_frequency: tag(log, range(10, 1000000))
+```
+
+Note that the XGBoost command shown includes `tabularization.window_sizes` and `tabularization.aggs` in the parameters to sweep over.
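+
+For intuition, each Optuna trial samples one concrete value per swept parameter, so a single run from this sweep resolves to ordinary Hydra overrides such as the following (values are illustrative):
+
+```console
+meds-tab-xgboost MEDS_cohort_dir="path_to_data" task_name=$TASK \
+    +model_params.model.eta=0.01 +model_params.model.max_depth=8 \
+    model_params.num_boost_round=500 \
+    tabularization.min_code_inclusion_frequency=100
+```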
+
+For a complete example on MIMIC-IV, and for all of our config files, see the [MIMIC-IV companion repository](https://github.com/mmcdermott/MEDS_TAB_MIMIC_IV).
+
+#### 2.1 XGBoost Performance on MIMIC-IV
+
+| Task | Index Timestamp | AUC | Minimum Code Inclusion Frequency | Number of Included Codes\* | Window Sizes | Aggregations |
+| ------------------------------- | ----------------- | ----- | -------------------------------- | -------------------------- | ---------------------- | --------------------------------------------------------------------------- |
+| Post-discharge 30 day Mortality | Discharge | 0.935 | 1,371 | 5,712 | \[7d,full\] | \[code/count,value/count,value/min,value/max\] |
+| Post-discharge 1 year Mortality | Discharge | 0.898 | 289 | 10,048 | \[2h,12h,1d,30d,full\] | \[static/present,code/count,value/sum_sqd,value/min\] |
+| 30 day Readmission | Discharge | 0.708 | 303 | 9,903 | \[30d,365d,full\] | \[code/count,value/count,value/sum,value/sum_sqd,value/max\] |
+| In ICU Mortality | Admission + 24 hr | 0.661 | 7,059 | 3,037 | \[12h,full\] | \[static/present,code/count,value/sum,value/min,value/max\] |
+| In ICU Mortality | Admission + 48 hr | 0.673 | 71 | 16,112 | \[1d,7d,full\] | \[static/present,code/count,value/sum,value/min,value/max\] |
+| In Hospital Mortality | Admission + 24 hr | 0.812 | 43 | 18,989 | \[1d,full\] | \[static/present,code/count,value/sum,value/min,value/max\] |
+| In Hospital Mortality | Admission + 48 hr | 0.810 | 678 | 7,433 | \[1d,full\] | \[static/present,code/count,value/count\] |
+| LOS in ICU > 3 days | Admission + 24 hr | 0.946 | 30,443 | 1,624 | \[2h,7d,30d\] | \[static/present,code/count,value/count,value/sum,value/sum_sqd,value/max\] |
+| LOS in ICU > 3 days | Admission + 48 hr | 0.967 | 2,864 | 4,332 | \[2h,7d,30d\] | \[code/count,value/sum_sqd,value/max\] |
+| LOS in Hospital > 3 days | Admission + 24 hr | 0.943 | 94,633 | 912 | \[12h,1d,7d\] | \[code/count,value/count,value/sum_sqd\] |
+| LOS in Hospital > 3 days | Admission + 48 hr | 0.945 | 30,880 | 1,619 | \[1d,7d,30d\] | \[code/count,value/sum,value/min,value/max\] |
+
+\* Number of Included Codes is based on the Minimum Code Inclusion Frequency -- we calculated the number of codes above that threshold and report the count.
+
+#### 2.2 XGBoost Optimal Found Model Parameters
+
+Additionally, the model parameters from the highest-performing run are reported below.
+ +| Task | Index Timestamp | Eta | Lambda | Alpha | Subsample | Minimum Child Weight | Number of Boosting Rounds | Early Stopping Rounds | Max Tree Depth | +| ------------------------------- | ----------------- | ----- | ------ | ----- | --------- | -------------------- | ------------------------- | --------------------- | -------------- | +| Post-discharge 30 day Mortality | Discharge | 0.006 | 0.032 | 0.374 | 0.572 | 53 | 703 | 9 | 16 | +| Post-discharge 1 year Mortality | Discharge | 0.009 | 0.086 | 0.343 | 0.899 | 76 | 858 | 9 | 11 | +| 30 day Readmission | Discharge | 0.006 | 0.359 | 0.374 | 0.673 | 53 | 712 | 9 | 16 | +| In ICU Mortality | Admission + 24 hr | 0.038 | 0.062 | 0.231 | 0.995 | 89 | 513 | 7 | 14 | +| In ICU Mortality (first 48h) | Admission + 48 hr | 0.044 | 0.041 | 0.289 | 0.961 | 91 | 484 | 5 | 14 | +| In Hospital Mortality | Admission + 24 hr | 0.028 | 0.013 | 0.011 | 0.567 | 11 | 454 | 6 | 9 | +| In Hospital Mortality | Admission + 48 hr | 0.011 | 0.060 | 0.179 | 0.964 | 84 | 631 | 7 | 13 | +| LOS in ICU > 3 days | Admission + 24 hr | 0.012 | 0.090 | 0.137 | 0.626 | 26 | 650 | 8 | 14 | +| LOS in ICU > 3 days | Admission + 48 hr | 0.012 | 0.049 | 0.200 | 0.960 | 84 | 615 | 7 | 13 | +| LOS in Hospital > 3 days | Admission + 24 hr | 0.008 | 0.067 | 0.255 | 0.989 | 90 | 526 | 5 | 14 | +| LOS in Hospital > 3 days | Admission + 48 hr | 0.001 | 0.030 | 0.028 | 0.967 | 9 | 538 | 8 | 7 | + +## XGBoost Model Performance on eICU Tasks + +### eICU Sweep + +The eICU sweep was conducted equivalently to the MIMIC-IV sweep. Please refer to the MIMIC-IV Sweep subsection above for details on the commands and sweep parameters. + +For more details about eICU specific task generation and running, see the [eICU companion repository](https://github.com/mmcdermott/MEDS_TAB_EICU). + +#### 1. XGBoost Performance on eICU + +| Task | Index Timestamp | AUC | Minimum Code Inclusion Frequency | Window Sizes | Aggregations | +| ------------------------------- | ----------------- | ----- | -------------------------------- | ------------------------ | -------------------------------------------------------------- | +| Post-discharge 30 day Mortality | Discharge | 0.603 | 68,235 | \[12h,1d,full\] | \[code/count,value/sum_sqd,value/max\] | +| Post-discharge 1 year Mortality | Discharge | 0.875 | 3,280 | \[30d,365d\] | \[static/present,value/sum,value/sum_sqd,value/min,value/max\] | +| In Hospital Mortality | Admission + 24 hr | 0.855 | 335,912 | \[2h,7d,30d,365d,full\] | \[static/present,code/count,value/count,value/min,value/max\] | +| In Hospital Mortality | Admission + 48 hr | 0.570 | 89,121 | \[12h,1d,30d\] | \[code/count,value/count,value/min\] | +| LOS in ICU > 3 days | Admission + 24 hr | 0.783 | 7,881 | \[1d,30d,full\] | \[static/present,code/count,value/count,value/sum,value/max\] | +| LOS in ICU > 3 days | Admission + 48 hr | 0.757 | 1,719 | \[2h,12h,7d,30d,full\] | \[code/count,value/count,value/sum,value/sum_sqd,value/min\] | +| LOS in Hospital > 3 days | Admission + 24 hr | 0.864 | 160 | \[1d,30d,365d,full\] | \[static/present,code/count,value/min,value/max\] | +| LOS in Hospital > 3 days | Admission + 48 hr | 0.895 | 975 | \[12h,1d,30d,365d,full\] | \[code/count,value/count,value/sum,value/sum_sqd\] | + +#### 2. 
XGBoost Optimal Found Model Parameters
+
+| Task | Index Timestamp | Eta | Lambda | Alpha | Subsample | Minimum Child Weight | Number of Boosting Rounds | Early Stopping Rounds | Max Tree Depth |
+| ------------------------------- | ----------------- | ----- | ------ | ----- | --------- | -------------------- | ------------------------- | --------------------- | -------------- |
+| In Hospital Mortality | Admission + 24 hr | 0.043 | 0.001 | 0.343 | 0.879 | 13 | 574 | 9 | 14 |
+| In Hospital Mortality | Admission + 48 hr | 0.002 | 0.002 | 0.303 | 0.725 | 0 | 939 | 9 | 12 |
+| LOS in ICU > 3 days | Admission + 24 hr | 0.210 | 0.189 | 0.053 | 0.955 | 5 | 359 | 6 | 14 |
+| LOS in ICU > 3 days | Admission + 48 hr | 0.340 | 0.393 | 0.004 | 0.900 | 6 | 394 | 10 | 13 |
+| LOS in Hospital > 3 days | Admission + 24 hr | 0.026 | 0.238 | 0.033 | 0.940 | 46 | 909 | 5 | 11 |
+| LOS in Hospital > 3 days | Admission + 48 hr | 0.100 | 0.590 | 0.015 | 0.914 | 58 | 499 | 10 | 9 |
+| Post-discharge 30 day Mortality | Discharge | 0.003 | 0.0116 | 0.001 | 0.730 | 13 | 986 | 7 | 7 |
+| Post-discharge 1 year Mortality | Discharge | 0.005 | 0.006 | 0.002 | 0.690 | 93 | 938 | 6 | 14 |
+
+#### 3. eICU Task Specific Training Cohort Size
+
+| Task | Index Timestamp | Number of Patients | Number of Events |
+| ------------------------------- | ----------------- | ------------------ | ---------------- |
+| Post-discharge 30 day Mortality | Discharge | 91,405 | 91,405 |
+| Post-discharge 1 year Mortality | Discharge | 91,405 | 91,405 |
+| In Hospital Mortality | Admission + 24 hr | 3,585 | 3,585 |
+| In Hospital Mortality | Admission + 48 hr | 1,527 | 1,527 |
+| LOS in ICU > 3 days | Admission + 24 hr | 12,672 | 14,004 |
+| LOS in ICU > 3 days | Admission + 48 hr | 12,712 | 14,064 |
+| LOS in Hospital > 3 days | Admission + 24 hr | 99,540 | 99,540 |
+| LOS in Hospital > 3 days | Admission + 48 hr | 99,786 | 99,786 |
diff --git a/docs/source/profiling.md b/docs/source/profiling.md
new file mode 100644
index 0000000..4ae644e
--- /dev/null
+++ b/docs/source/profiling.md
@@ -0,0 +1,84 @@
+# Computational Performance vs. Existing Pipelines
+
+Evaluating the computational overhead of tabularization methods is essential for assessing their efficiency and suitability for large-scale medical data processing. This section presents a comparative analysis of the computational overhead of MEDS-Tab relative to other systems like Catabra and TSFresh. It outlines the performance of each system in terms of wall time, memory usage, and output size, highlighting the computational efficiency and scalability of MEDS-Tab.
+
+## 1. System Comparison Overview
+
+The systems compared in this study represent different approaches to data tabularization, with the main difference being MEDS-Tab's use of sparse tabularization. Specifically, for comparison we used:
+
+1. **Catabra/Catabra-Mem**: Offers data processing capabilities for time-series medical data, with variations to test memory management.
+2. **TSFresh**: Widely known and used for its extensive feature extraction capabilities.
+
+The benchmarking tests were conducted using the following hardware and software settings:
+
+- **CPU Specification**: 2 x AMD EPYC 7713 64-Core Processor
+- **RAM Specification**: 1024GB, 3200MHz, DDR4
+- **Software Environment**: Ubuntu 22.04.4 LTS
+
+### MEDS-Tab Tabularization Technique
+
+Tabularization of time-series data is commonly used in several past works.
+The only two libraries to our knowledge that provide a full tabularization pipeline are `tsfresh` and `catabra`; `catabra` also offers a slower but more memory-efficient version of its method, which we denote `catabra-mem`. Other libraries provide only rolling-window functionality (`featuretools`) or just pivoting operations (`Temporai`/`Clairvoyance`, `sktime`, `AutoTS`). We provide a significantly faster and more memory-efficient method, and our findings show that on the MIMIC-IV and eICU medical datasets MEDS-Tab significantly outperforms both of the above-mentioned methods that provide similar functionality. While `catabra` and `tsfresh` could not even run within a budget of 10 minutes on as few as 10 patients' data for eICU, our method scales to hundreds of patients with low memory usage under the same time budget. We present the results below.
+
+## 2. Comparative Performance Analysis
+
+The tables below detail computational resource utilization across two datasets and various patient scales, emphasizing MEDS-Tab's superior performance in every scenario. The tables are organized by dataset and number of patients. For this analysis, the full set of window sizes and the aggregation method `code_count` were used. Additionally, we used a budget of 10 minutes for running our tests, given that for such a small number of patients (10, 100, and 500) the data should be processed quickly. Note that `catabra-mem` is omitted from the tables, as it never completed within the 10-minute budget.
+
+### eICU Dataset
+
+The only method that was able to tabularize eICU data was MEDS-Tab. We ran our method on both 100 and 500 patients; moving from 100 to 500 patients roughly tripled the number of codes. MEDS-Tab remained efficient in both time and memory usage.
+
+a) 100 Patients
+
+**Table 1: 6,374 Codes, 2,065,608 Rows, Output Shape \[132,461, 6,374\]**
+
+| Wall Time | Avg Memory | Peak Memory | Output Size | Method |
+| --------- | ---------- | ----------- | ----------- | -------- |
+| 0m39s | 5,271 MB | 14,791 MB | 362 MB | meds_tab |
+
+b) 500 Patients
+
+**Table 2: 18,314 Codes, 8,737,355 Rows, Output Shape \[565,014, 18,314\]**
+
+| Wall Time | Avg Memory | Peak Memory | Output Size | Method |
+| --------- | ---------- | ----------- | ----------- | -------- |
+| 3m4s | 8,335 MB | 15,102 MB | 1,326 MB | meds_tab |
+
+### MIMIC-IV Dataset
+
+MEDS-Tab, `tsfresh`, and `catabra` were tested across three different patient scales on MIMIC-IV.
+
+a) 10 Patients
+
+This table illustrates the efficiency of MEDS-Tab in processing a small subset of patients with extremely low computational cost and high data throughput, outperforming `tsfresh` and `catabra` in both time and memory efficiency.
+
+**Table 3: 1,504 Codes, 23,346 Rows, Output Shape \[2,127, 1,504\]**
+
+| Wall Time | Avg Memory | Peak Memory | Output Size | Method |
+| --------- | ---------- | ----------- | ----------- | -------- |
+| 0m2s | 423 MB | 943 MB | 7 MB | meds_tab |
+| 1m41s | 84,159 MB | 265,877 MB | 1 MB | tsfresh |
+| 0m15s | 2,537 MB | 4,781 MB | 1 MB | catabra |
+
+b) 100 Patients
+
+The performance gap was further highlighted with an increased number of patients and codes. For a moderate patient count, MEDS-Tab demonstrated superior performance with significantly lower wall times and memory usage compared to `tsfresh` and `catabra`.
+ +**Table 4: 4,154 Codes, 150,789 Rows, Output Shape \[15,664, 4,154\]** + +| Wall Time | Avg Memory | Peak Memory | Output Size | Method | +| --------- | ---------- | ----------- | ----------- | -------- | +| 0m5s | 718 MB | 1,167 MB | 45 MB | meds_tab | +| 5m9s | 217,477 MB | 659,735 MB | 4 MB | tsfresh | +| 3m17s | 14,319 MB | 28,342 MB | 4 MB | catabra | + +c) 500 Patients + +Scaling further to 500 patients, MEDS-Tab maintained consistent performance, reinforcing its capability to manage large datasets efficiently. Because of the set time limit of 10 minutes, we could not get results for `catabra` and `tsfresh`. In comparison, MEDS-Tab processed the data in about 15 seconds, making it at least 40 times faster for the given patient scale. + +**Table 5: 48,115 Codes, 795,368 Rows, Output Shape \[75,595, 8,115\]** + +| Wall Time | Avg Memory | Peak Memory | Output Size | Method | +| --------- | ---------- | ----------- | ----------- | -------- | +| 0m16s | 1,410 MB | 3,539 MB | 442 MB | meds_tab | + +______________________________________________________________________ diff --git a/docs/source/usage.rst b/docs/source/usage.rst deleted file mode 100644 index 684bc43..0000000 --- a/docs/source/usage.rst +++ /dev/null @@ -1,7 +0,0 @@ -Usage -====== - -.. include:: ../../README.md - :parser: markdown - :start-after: Usage - :end-before: How does MEDS-Tab Work? diff --git a/docs/source/xgboost-performance.rst b/docs/source/xgboost-performance.rst deleted file mode 100644 index 63e41b1..0000000 --- a/docs/source/xgboost-performance.rst +++ /dev/null @@ -1,6 +0,0 @@ -XGBoost Performance on MIMIC-IV and Philips eICU -================================================ - -.. include:: ../../README.md - :parser: markdown - :start-after: XGBoost Performance diff --git a/pyproject.toml b/pyproject.toml index e4d774c..1dd0be7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ classifiers = [ "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", ] -dependencies = ["polars", "pyarrow", "loguru", "hydra-core", "numpy", "scipy", "pandas", "tqdm", "xgboost", "scikit-learn", "hydra-optuna-sweeper", "hydra-joblib-launcher", "ml-mixins"] +dependencies = ["polars", "pyarrow", "loguru", "hydra-core", "numpy", "scipy<1.14.0", "pandas", "tqdm", "xgboost", "scikit-learn", "hydra-optuna-sweeper", "hydra-joblib-launcher", "ml-mixins"] [project.scripts] meds-tab-describe = "MEDS_tabular_automl.scripts.describe_codes:main" diff --git a/src/MEDS_tabular_automl/__init__.py b/src/MEDS_tabular_automl/__init__.py index e69de29..3b93d0b 100644 --- a/src/MEDS_tabular_automl/__init__.py +++ b/src/MEDS_tabular_automl/__init__.py @@ -0,0 +1 @@ +__version__ = "0.0.2" diff --git a/src/MEDS_tabular_automl/scripts/cache_task.py b/src/MEDS_tabular_automl/scripts/cache_task.py index b42e765..62df9b8 100644 --- a/src/MEDS_tabular_automl/scripts/cache_task.py +++ b/src/MEDS_tabular_automl/scripts/cache_task.py @@ -10,9 +10,9 @@ import scipy.sparse as sp from omegaconf import DictConfig -from MEDS_tabular_automl.file_name import list_subdir_files -from MEDS_tabular_automl.mapper import wrap as rwlock_wrap -from MEDS_tabular_automl.utils import ( +from ..file_name import list_subdir_files +from ..mapper import wrap as rwlock_wrap +from ..utils import ( CODE_AGGREGATIONS, STATIC_CODE_AGGREGATION, STATIC_VALUE_AGGREGATION, diff --git a/src/MEDS_tabular_automl/scripts/describe_codes.py b/src/MEDS_tabular_automl/scripts/describe_codes.py index 408980f..fdee111 100644 --- 
a/src/MEDS_tabular_automl/scripts/describe_codes.py +++ b/src/MEDS_tabular_automl/scripts/describe_codes.py @@ -10,19 +10,14 @@ from loguru import logger from omegaconf import DictConfig, OmegaConf -from MEDS_tabular_automl.describe_codes import ( +from ..describe_codes import ( compute_feature_frequencies, convert_to_df, convert_to_freq_dict, ) -from MEDS_tabular_automl.file_name import list_subdir_files -from MEDS_tabular_automl.mapper import wrap as rwlock_wrap -from MEDS_tabular_automl.utils import ( - get_shard_prefix, - hydra_loguru_init, - load_tqdm, - write_df, -) +from ..file_name import list_subdir_files +from ..mapper import wrap as rwlock_wrap +from ..utils import get_shard_prefix, hydra_loguru_init, load_tqdm, write_df config_yaml = files("MEDS_tabular_automl").joinpath("configs/describe_codes.yaml") if not config_yaml.is_file(): diff --git a/src/MEDS_tabular_automl/scripts/launch_xgboost.py b/src/MEDS_tabular_automl/scripts/launch_xgboost.py index 9089bd5..46233c2 100644 --- a/src/MEDS_tabular_automl/scripts/launch_xgboost.py +++ b/src/MEDS_tabular_automl/scripts/launch_xgboost.py @@ -13,9 +13,9 @@ from omegaconf import DictConfig, OmegaConf from sklearn.metrics import roc_auc_score -from MEDS_tabular_automl.describe_codes import get_feature_columns -from MEDS_tabular_automl.file_name import get_model_files, list_subdir_files -from MEDS_tabular_automl.utils import get_feature_indices, hydra_loguru_init +from ..describe_codes import get_feature_columns +from ..file_name import get_model_files, list_subdir_files +from ..utils import get_feature_indices, hydra_loguru_init config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_xgboost.yaml") if not config_yaml.is_file(): diff --git a/src/MEDS_tabular_automl/scripts/tabularize_static.py b/src/MEDS_tabular_automl/scripts/tabularize_static.py index d7434f0..2474442 100644 --- a/src/MEDS_tabular_automl/scripts/tabularize_static.py +++ b/src/MEDS_tabular_automl/scripts/tabularize_static.py @@ -14,16 +14,16 @@ from omegaconf import DictConfig -from MEDS_tabular_automl.describe_codes import ( +from ..describe_codes import ( convert_to_df, filter_parquet, get_feature_columns, get_feature_freqs, ) -from MEDS_tabular_automl.file_name import list_subdir_files -from MEDS_tabular_automl.generate_static_features import get_flat_static_rep -from MEDS_tabular_automl.mapper import wrap as rwlock_wrap -from MEDS_tabular_automl.utils import ( +from ..file_name import list_subdir_files +from ..generate_static_features import get_flat_static_rep +from ..mapper import wrap as rwlock_wrap +from ..utils import ( STATIC_CODE_AGGREGATION, STATIC_VALUE_AGGREGATION, filter_to_codes, diff --git a/src/MEDS_tabular_automl/scripts/tabularize_time_series.py b/src/MEDS_tabular_automl/scripts/tabularize_time_series.py index c48e59a..c6ecc98 100644 --- a/src/MEDS_tabular_automl/scripts/tabularize_time_series.py +++ b/src/MEDS_tabular_automl/scripts/tabularize_time_series.py @@ -15,12 +15,12 @@ from loguru import logger from omegaconf import DictConfig -from MEDS_tabular_automl.describe_codes import filter_parquet, get_feature_columns -from MEDS_tabular_automl.file_name import list_subdir_files -from MEDS_tabular_automl.generate_summarized_reps import generate_summary -from MEDS_tabular_automl.generate_ts_features import get_flat_ts_rep -from MEDS_tabular_automl.mapper import wrap as rwlock_wrap -from MEDS_tabular_automl.utils import ( +from ..describe_codes import filter_parquet, get_feature_columns +from ..file_name import list_subdir_files +from 
..generate_summarized_reps import generate_summary
+from ..generate_ts_features import get_flat_ts_rep
+from ..mapper import wrap as rwlock_wrap
+from ..utils import (
     STATIC_CODE_AGGREGATION,
     STATIC_VALUE_AGGREGATION,
     get_shard_prefix,
diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py
index 9527aea..c398a39 100644
--- a/src/MEDS_tabular_automl/utils.py
+++ b/src/MEDS_tabular_automl/utils.py
@@ -78,7 +78,7 @@ def filter_to_codes(
     return sorted(feature_freqs["code"].to_list())


-OmegaConf.register_new_resolver("filter_to_codes", filter_to_codes)
+OmegaConf.register_new_resolver("filter_to_codes", filter_to_codes, replace=True)


 def load_tqdm(use_tqdm: bool):
@@ -158,20 +158,20 @@ def get_min_dtype(array: np.ndarray) -> np.dtype:
         The minimal dtype that can represent the array, or the array's dtype if it is non-numeric.

     Examples:
-        >>> get_min_dtype(np.array([1, 2, 3]))
-        dtype('uint8')
-        >>> get_min_dtype(np.array([1, 2, 3, int(1e9)]))
-        dtype('uint32')
-        >>> get_min_dtype(np.array([1, 2, 3, int(1e18)]))
-        dtype('uint64')
-        >>> get_min_dtype(np.array([1, 2, 3, -128]))
-        dtype('int8')
-        >>> get_min_dtype(np.array([1.0, 2.0, 3.0]))
-        dtype('float32')
-        >>> get_min_dtype(np.array([1, 2, 3, np.nan]))
-        dtype('float32')
-        >>> get_min_dtype(np.array([1, 2, 3, "a"]))
-        dtype('<U21')
+        >>> get_min_dtype(np.array([1, 2, 3]))  # doctest:+ELLIPSIS
+        dtype('...')
+        >>> get_min_dtype(np.array([1, 2, 3, int(1e9)]))  # doctest:+ELLIPSIS
+        dtype('...')
+        >>> get_min_dtype(np.array([1, 2, 3, int(1e18)]))  # doctest:+ELLIPSIS
+        dtype('...')
+        >>> get_min_dtype(np.array([1, 2, 3, -128]))  # doctest:+ELLIPSIS
+        dtype('...')
+        >>> get_min_dtype(np.array([1.0, 2.0, 3.0]))  # doctest:+ELLIPSIS
+        dtype('...')
+        >>> get_min_dtype(np.array([1, 2, 3, np.nan]))  # doctest:+ELLIPSIS
+        dtype('...')
+        >>> get_min_dtype(np.array([1, 2, 3, "a"]))  # doctest:+ELLIPSIS
+        dtype('...')
     """
     if np.issubdtype(array.dtype, np.integer):
         return np.result_type(np.min_scalar_type(array.min()), array.max())