diff --git a/.github/workflows/code-quality-main.yaml b/.github/workflows/code-quality-main.yaml index bb2d601..66fd427 100644 --- a/.github/workflows/code-quality-main.yaml +++ b/.github/workflows/code-quality-main.yaml @@ -20,5 +20,9 @@ jobs: with: python-version: "3.11" + - name: Install packages + run: | + pip install .[dev] + - name: Run pre-commits uses: pre-commit/action@v3.0.1 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6fd8933..8426225 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,7 +3,7 @@ default_language_version: repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v5.0.0 hooks: # list of supported hooks: https://pre-commit.com/hooks.html - id: trailing-whitespace @@ -88,6 +88,7 @@ repos: rev: 0.7.17 hooks: - id: mdformat + exclude: "^docs/.*\\.md$" args: ["--number"] additional_dependencies: - mdformat-gfm diff --git a/README.md b/README.md index d091bb3..d618ff5 100644 --- a/README.md +++ b/README.md @@ -13,42 +13,9 @@ License

-This repository provides utilities and scripts to run limited automatic tabular ML pipelines for generic MEDS -datasets. +# Welcome! -______________________________________________________________________ - -# Usage - -This repository consists of two key pieces: - -1. Construction and efficient loading of tabular (flat, non-longitudinal) summary features describing patient records in MEDS over arbitrary time windows (e.g. 1 year, 6 months, etc.), which go backward in time from a given index date. -2. Running a basic XGBoost AutoML pipeline over these tabular features to predict arbitrary binary classification or regression downstream tasks defined over these datasets. The "AutoML" part of this is not particularly advanced -- what is more advanced is the efficient construction, storage, and loading of tabular features for the candidate AutoML models, enabling a far more extensive search over a much larger total number of features than prior systems. - -## Quick Start - -To use MEDS-Tab, install the dependencies following commands below. Note that this version of MEDS-Tab is -compatible with [MEDS v0.3](https://github.com/Medical-Event-Data-Standard/meds/releases/tag/0.3.0) - -**Pip Install** - -```console -pip install meds-tab -``` - -**Local Install** - -```console -# clone the git repo -pip install . -``` - -## Scripts and Examples - -For an end to end example, including re-sharding the input via MEDS-Transforms, see -[this example script](https://gist.github.com/mmcdermott/34194e484d7b2a2f68967b9bbccfb35b) - -See [`/tests/test_integration.py`](https://github.com/mmcdermott/MEDS_Tabular_AutoML/blob/main/tests/test_integration.py) for a local example of the end-to-end pipeline (minus re-sharding) being run on synthetic data. This script is a functional test that is also run with `pytest` to verify the correctness of the algorithm. +MEDS-Tab is a library designed for automated tabularization and data preparation with aggregations and time windowing. Read on for an overview of MEDS-Tab and how it could be useful in your workflows! ## Why MEDS-Tab? @@ -61,129 +28,48 @@ MEDS-Tab is a comprehensive framework designed to streamline the handling, model MEDS-Tab leverages the recently developed, minimal, easy-to-use Medical Event Data Standard (MEDS) schema to standardize structured EHR data to a consistent schema from which baselines can be reliably produced across arbitrary tasks and settings. In order to use MEDS-Tab, you will first need to transform your raw EHR data to a MEDS format, which can be done using the following libraries: -- [MEDS Polars](https://github.com/mmcdermott/MEDS_polars_functions) for a set of functions and scripts for extraction to and transformation/pre-processing of MEDS-formatted data. +- [MEDS Transforms](https://github.com/mmcdermott/MEDS_transforms) for a set of functions and scripts for extraction to and transformation/pre-processing of MEDS-formatted data. - [MEDS ETL](https://github.com/Medical-Event-Data-Standard/meds_etl) for a collection of ETLs from common data formats to MEDS. The package library currently supports MIMIC-IV, OMOP v5, and MEDS FLAT (a flat version of MEDS). ### II. Run MEDS-Tab -- Run the MEDS-Tab Command-Line Interface tool (`MEDS-Tab-cli`) to extract cohorts based on your task - check out the [Usage Guide](https://meds-tab--36.org.readthedocs.build/en/36/overview.html#core-cli-scripts-overview)!
+- Run the MEDS-Tab Command-Line Interface tool (`MEDS-Tab-cli`) to extract cohorts based on your task - check out the [Usage Guide](https://meds-tab--36.org.readthedocs.build/en/36/overview.html#core-cli-scripts-overview) and the [MIMIC-IV tutorial](https://github.com/mmcdermott/MEDS_Tabular_AutoML/tree/main/MIMICIV_TUTORIAL)! -- Painless Reproducibility: Use [MEDS-Tab](https://github.com/mmcdermott/MEDS_TAB_MIMIC_IV/tree/main/tasks) to obtain comparable, reproducible, and well-tuned XGBoost results tailored to your dataset-specific feature space! +- Painless Reproducibility: Use [MEDS-Tab](https://github.com/mmcdermott/MEDS_Tabular_AutoML/tree/main/MIMICIV_TUTORIAL) to obtain comparable, reproducible, and well-tuned XGBoost results tailored to your dataset-specific feature space! By following these steps, you can seamlessly transform your dataset, define necessary criteria, and leverage powerful machine learning tools within the MEDS-Tab ecosystem. This approach not only simplifies the process but also ensures high-quality, reproducible results for your machine learning tasks for health projects. It can reliably take no more than a week of full-time human effort to perform Steps I-V on new datasets in reasonable raw formulations! -## Core CLI Scripts Overview - -0. First, if your data is not already sharded to the degree you want and in a manner that subdivides your - splits with the format `"$SPLIT_NAME/\d+.parquet"`, where `$SPLIT_NAME` does not contain slashes, you will - need to re-shard your data. This can be done via the - [MEDS-Transforms](https://github.com/mmcdermott/MEDS_transforms) library, which is not included in this - repository. Having data sharded by split _is a necessary step_ to ensure that the data is efficiently - processed in parallel. You can easily re-shard your input MEDS cohort in the environment into which this - package is installed with the following command: - - ```console - # Re-shard pipeline - # $MIMICIV_input_dir is the directory containing the input, MEDS v0.3 formatted MIMIC-IV data - # $MEDS_TAB_COHORT_DIR is the directory where the re-sharded MEDS dataset will be stored, and where your model - # will store cached files during processing by default. - # $N_PATIENTS_PER_SHARD is the number of patients per shard you want to use. - MEDS_transform-reshard_to_split \ - input_dir="$MIMICIV_input_dir" \ - cohort_dir="$MEDS_TAB_COHORT_DIR" \ - 'stages=["reshard_to_split"]' \ - stage="reshard_to_split" \ - stage_configs.reshard_to_split.n_patients_per_shard=$N_PATIENTS_PER_SHARD - ``` - -1. **`meds-tab-describe`**: This command processes MEDS data shards to compute the frequencies of different code types. It differentiates codes into the following categories: - - - time-series codes (codes with timestamps) - - time-series numerical values (codes with timestamps and numerical values) - - static codes (codes without timestamps) - - static numerical codes (codes without timestamps but with numerical values). - - This script further caches feature names and frequencies in a dataset stored in a `code_metadata.parquet` file within the `input_dir` argument specified as a hydra-style command line argument. - -2. **`meds-tab-tabularize-static`**: Filters and processes the dataset based on the frequency of codes, generating a tabular vector for each patient at each timestamp in the shards. Each row corresponds to a unique `subject_id` and `timestamp` combination, thus rows are duplicated across multiple timestamps for the same patient. 
- - **Example: Tabularizing static data** with the minimum code frequency of 10, window sizes of `[1d, 30d, 365d, full]`, and value aggregation methods of `[static/present, static/first, code/count, value/count, value/sum, value/sum_sqd, value/min, value/max]` - - ```console - meds-tab-tabularize-static input_dir="path_to_data" \ - tabularization.min_code_inclusion_frequency=10 \ - tabularization.window_sizes=[1d,30d,365d,full] \ - do_overwrite=False \ - tabularization.aggs=[static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]" - ``` - - - For the exhaustive examples of value aggregations, see [`/src/MEDS_tabular_automl/utils.py`](https://github.com/mmcdermott/MEDS_Tabular_AutoML/blob/main/src/MEDS_tabular_automl/utils.py#L24) - -3. **`meds-tab-tabularize-time-series`**: Iterates through combinations of a shard, `window_size`, and `aggregation` to generate feature vectors that aggregate patient data for each unique `subject_id` x `timestamp`. This stage (and the previous stage) uses sparse matrix formats to efficiently handle the computational and storage demands of rolling window calculations on large datasets. We support parallelization through Hydra's [`--multirun`](https://hydra.cc/docs/intro/#multirun) flag and the [`joblib` launcher](https://hydra.cc/docs/plugins/joblib_launcher/#internaldocs-banner). - - **Example: Aggregate time-series data** on features across different `window_sizes` - - ```console - meds-tab-tabularize-time-series --multirun \ - worker="range(0,$N_PARALLEL_WORKERS)" \ - hydra/launcher=joblib \ - input_dir="path_to_data" \ - tabularization.min_code_inclusion_frequency=10 \ - do_overwrite=False \ - tabularization.window_sizes=[1d,30d,365d,full] \ - tabularization.aggs=[static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max] - ``` +______________________________________________________________________ -4. **`meds-tab-cache-task`**: Aligns task-specific labels with the nearest prior event in the tabularized data. It requires a labeled dataset directory with three columns (`subject_id`, `timestamp`, `label`) structured similarly to the `input_dir`. +# Usage - **Example: Align tabularized data** for a specific task `$TASK` and labels that have been pulled from [ACES](https://github.com/justin13601/ACES) +This repository consists of two key pieces: - ```console - meds-tab-cache-task input_dir="path_to_data" \ - task_name=$TASK \ - tabularization.min_code_inclusion_frequency=10 \ - do_overwrite=False \ - tabularization.window_sizes=[1d,30d,365d,full] \ - tabularization.aggs=[static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max] - ``` +1. Construction and efficient loading of tabular (flat, non-longitudinal) summary features describing patient records in MEDS over arbitrary time windows (e.g. 1 year, 6 months, etc.), which go backward in time from a given index date. +2. Running a basic XGBoost AutoML pipeline over these tabular features to predict arbitrary binary classification or regression downstream tasks defined over these datasets. The "AutoML" part of this is not particularly advanced -- what is more advanced is the efficient construction, storage, and loading of tabular features for the candidate AutoML models, enabling a far more extensive search over a much larger total number of features than prior systems. -5. **`meds-tab-xgboost`**: Trains an XGBoost model using user-specified parameters. 
Permutations of `window_sizes` and `aggs` can be generated using `generate-subsets` command (See the section below for descriptions). +## Quick Start - ```console - meds-tab-xgboost --multirun \ - input_dir="path_to_data" \ - task_name=$TASK \ - output_dir="output_directory" \ - tabularization.min_code_inclusion_frequency=10 \ - tabularization.window_sizes=$(generate-subsets [1d,30d,365d,full]) \ - do_overwrite=False \ - tabularization.aggs=$(generate-subsets [static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]) - ``` +To use MEDS-Tab, install the dependencies with the commands below: -## Additional CLI Scripts +**Pip Install** -1. **`generate-subsets`**: Generates and prints a sorted list of all non-empty subsets from a comma-separated input. This is provided for the convenience of sweeping over all possible combinations of window sizes and aggregations. +```console +pip install meds-tab +``` - For example, you can directly call **`generate-subsets`** in the command line: +**Local Install** - ```console - generate-subsets [2,3,4] \ - [2], [2, 3], [2, 3, 4], [2, 4], [3], [3, 4], [4] - ``` +```console +# clone the git repo +pip install . +``` - This could be used in the command line in concert with other calls. For example, the following call: - - ```console - meds-tab-xgboost --multirun tabularization.window_sizes=$(generate-subsets [1d,2d,7d,full]) - ``` - - would resolve to: +## Scripts and Examples - ```console - meds-tab-xgboost --multirun tabularization.window_sizes=[1d],[1d,2d],[1d,2d,7d],[1d,2d,7d,full],[1d,2d,full],[1d,7d],[1d,7d,full],[1d,full],[2d],[2d,7d],[2d,7d,full],[2d,full],[7d],[7d,full],[full] - ``` +For an end-to-end example over MIMIC-IV, see the [MIMIC-IV tutorial](https://github.com/mmcdermott/MEDS_Tabular_AutoML/tree/main/MIMICIV_TUTORIAL). - which can then be correctly interpreted by Hydra's multirun logic. +See [`/tests/test_integration.py`](https://github.com/mmcdermott/MEDS_Tabular_AutoML/blob/main/tests/test_integration.py) for a local example of the end-to-end pipeline being run on synthetic data. This script is a functional test that is also run with `pytest` to verify the correctness of the algorithm. ## Roadmap @@ -205,12 +91,9 @@ MEDS-Tab has several key limitations which we plan to address in future changes. 1. The computation and use of the code metadata dataframe, containing frequencies of codes, should be offloaded to core MEDS functionality, with the remaining code in this repository cleaned up. - [#28](https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues/28) - - [#14](https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues/14) 2. We should add more doctests and push test coverage up to 100% - [#29](https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues/29) - [#30](https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues/30) -3. We need to ensure full and seamless compatibility with the ACES CLI tool, rather than relying on the python API and manual adjustments: - [#34](https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues/34) ## What do you mean "tabular pipelines"? Isn't _all_ structured EHR data already tabular? @@ -224,405 +107,3 @@ Thus, in this pipeline, when we say we will produce a "tabular" view of MEDS dat realize these constraints, which will explicitly involve summarizing the patient data over various historical or future windows in time to produce a single row per patient with a consistent, logical set of columns (though there may still be missingness).
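To make the windowed summarization concrete, here is a minimal sketch of the idea on toy data, using Polars (which the pipeline itself uses for time-series processing). This is illustrative only, not MEDS-Tab's implementation, and the column names are assumptions:

```python
import datetime

import polars as pl

# Toy MEDS-like events: one row per (subject, time, code, numeric value).
events = pl.DataFrame(
    {
        "subject_id": [1, 1, 1, 2],
        "time": [
            datetime.datetime(2024, 1, 5),
            datetime.datetime(2024, 1, 25),
            datetime.datetime(2023, 6, 1),  # outside the 30d window below
            datetime.datetime(2024, 1, 20),
        ],
        "code": ["HR", "HR", "LAB//K", "HR"],
        "numeric_value": [88.0, 92.0, 4.1, 75.0],
    }
)

# One index date per subject; windows extend backward in time from it.
index_dates = pl.DataFrame(
    {
        "subject_id": [1, 2],
        "index_time": [datetime.datetime(2024, 2, 1), datetime.datetime(2024, 2, 1)],
    }
)

# Summarize each subject's last 30 days into a single fixed-width row.
tabular = (
    events.join(index_dates, on="subject_id")
    .filter(
        (pl.col("time") <= pl.col("index_time"))
        & (pl.col("time") > pl.col("index_time") - pl.duration(days=30))
    )
    .group_by("subject_id")
    .agg(
        pl.len().alias("30d/code/count"),
        pl.col("numeric_value").sum().alias("30d/value/sum"),
        pl.col("numeric_value").max().alias("30d/value/max"),
    )
)
print(tabular)
```

The real pipeline computes one such column per (code, window size, aggregation) combination, which is why the resulting matrices are wide, highly sparse, and stored in sparse formats.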
- -______________________________________________________________________ - -# The MEDS-Tab Architecture - -In this section, we describe the MEDS-Tab architecture, specifically some of the pipeline choices we made to reduce memory usage and increase speed during the tabularization process and XGBoost tuning process. - -We break our method into 4 discrete parts: - -1. Describe codes (compute feature frequencies) -2. Tabularization of time-series data -3. Efficient data caching for task-specific rows -4. XGBoost training - -## 1. Describe Codes (compute feature frequencies) - -This initial stage processes a pre-shareded dataset. We expect a structure as follows where each shard contains a subset of the patients: - -```text -/PATH/TO/MEDS/DATA -│ -└─── -│ │ .parquet -│ │ .parquet -│ │ ... -│ -└─── -│ │ .parquet -│ │ .parquet -| │ ... -| -... -``` - -We then compute and store feature frequencies, crucial for determining which features are relevant for further analysis. - -**Detailed Workflow:** - -- **Data Loading and Sharding**: We iterate through shards to compute feature frequencies for each shard. -- **Frequency Aggregation**: After computing frequencies across shards, we aggregate them to get a final count of each feature across the entire dataset training dataset, which allows us to filter out infrequent features in the tabularization stage or when tuning XGBoost. - -## 2. Tabularization of Time-Series Data - -### Overview - -The tabularization stage of our pipeline, exposed via the cli commands: - -- `meds-tab-tabularize-static` for tabularizing static data -- and `meds-tab-tabularize-time-series` for tabularizing the time series data - -Static data is relatively small in the medical datasets, so we use a dense pivot operation, convert it to a sparse matrix, and then duplicate rows such that the static data will match up with the time series data rows generated in the next step. Static data is currently processed serially. - -The script for tabularizing time series data primarily transforms a raw, unstructured dataset into a structured, feature-rich dataset by utilizing a series of sophisticated data processing steps. This transformation (as depicted in the figure below) involves converting raw time series from a Polars dataframe into a sparse matrix format, aggregating events that occur at the same date for the same patient, and then applying rolling window aggregations to extract temporal features. - -![Time Series Tabularization Method](docs/assets/pivot.png) - -### High-Level Tabularization Algorithm - -1. **Data Loading and Categorization**: - - - The script iterates through shards of patients, and shards can be processed in parallel using hydras joblib to launch multiple processes. - -2. **Sparse Matrix Conversion**: - - - Data from the Polars dataframe is converted into a sparse matrix format, where each row represents a unique event (patient x timestamp), and each column corresponds to a MEDS code for the patient. - -3. **Rolling Window Aggregation**: - - - For each aggregation method (sum, count, min, max, etc.), events that occur on the same date for the same patient are aggregated. This reduces the amount of data we have to perform rolling windows over. - - Then we aggregate features over the specified rolling windows sizes. - -4. **Output Storage**: - - - Sparse array is converted to Coordinate List format and stored as a `.npz` file on disk. 
- - The file paths look as follows - -```text -/PATH/TO/MEDS/TABULAR_DATA -│ -└─── - ├─── - │ ├───code - │ │ └───count.npz - │ └───value - │ └───sum.npz - ... -``` - -## 3. Efficient Data Caching for Task-Specific Rows - -Now that we have generated tabular features for all the events in our dataset, we can cache subsets relevant for each task we wish to train a supervised model on. This step is critical for efficiently training machine learning models on task-specific data without having to load the entire dataset. - -**Detailed Workflow:** - -- **Row Selection Based on Tasks**: Only the data rows that are relevant to the specific tasks are selected and cached. This reduces the memory footprint and speeds up the training process. -- **Use of Sparse Matrices for Efficient Storage**: Sparse matrices are again employed here to store the selected data efficiently, ensuring that only non-zero data points are kept in memory, thus optimizing both storage and retrieval times. - -The file structure for the cached data mirrors that of the tabular data, also consisting of `.npz` files, where users must specify the directory that stores labels. Labels follow the same shard file structure as the input meds data from step (1), and the label parquets need `subject_id`, `timestamp`, and `label` columns. - -## 4. XGBoost Training - -The final stage uses the processed and cached data to train an XGBoost model. This stage is optimized to handle the sparse data structures produced in earlier stages efficiently. - -**Detailed Workflow:** - -- **Iterator for Data Loading**: Custom iterators are designed to load sparse matrices efficiently into the XGBoost training process, which can handle sparse inputs natively, thus maintaining high computational efficiency. -- **Training and Validation**: The model is trained using the tabular data, with evaluation steps that include early stopping to prevent overfitting and tuning of hyperparameters based on validation performance. -- **Hyperparameter Tuning**: We use [optuna](https://optuna.org/) to tune over XGBoost model parameters, aggregations, window sizes, and the minimum code inclusion frequency. - -______________________________________________________________________ - -# Computational Performance vs. Existing Pipelines - -Evaluating the computational overhead of tabularization methods is essential for assessing their efficiency and suitability for large-scale medical data processing. This section presents a comparative analysis of the computational overhead of MEDS-Tab with other systems like Catabra and TSFresh. It outlines the performance of each system in terms of wall time, memory usage, and output size, highlighting the computational efficiency and scalability of MEDS-Tab. - -## 1. System Comparison Overview - -The systems compared in this study represent different approaches to data tabularization, with the main difference being MEDS-Tab usage of sparse tabularization. Specifically, for comparison we used: - -1. **Catabra/Catabra-Mem**: Offers data processing capabilities for time-series medical data, with variations to test memory management. -2. **TSFresh**: Both known and used for extensive feature extraction capabilities. 
- -The benchmarking tests were conducted using the following hardware and software settings: - -- **CPU Specification**: 2 x AMD EPYC 7713 64-Core Processor -- **RAM Specification**: 1024GB, 3200MHz, DDR4 -- **Software Environment**: Ubuntu 22.04.4 LTS - -### MEDS-Tab Tabularization Technique - -Tabularization of time-series data, as depicted above, is commonly used in several past works. The only two libraries to our knowledge that provide a full tabularization pipeline are `tsfresh` and `catabra`. `catabra` also offers a slower but more memory-efficient version of their method which we denote `catabra-mem`. Other libraries either provide only rolling window functionalities (`featuretools`) or just pivoting operations (`Temporai`/`Clairvoyance`, `sktime`, `AutoTS`). We provide a significantly faster and more memory-efficient method. Our findings show that on the MIMIC-IV and eICU medical datasets, we significantly outperform both above-mentioned methods that provide similar functionalities with MEDS-Tab. While `catabra` and `tsfresh` could not even run within a budget of 10 minutes on as low as 10 patients' data for eICU, our method scales to process hundreds of patients with low memory usage under the same time budget. We present the results below. - -## 2. Comparative Performance Analysis - -The tables below detail computational resource utilization across two datasets and various patient scales, emphasizing the better performance of MEDS-Tab in all of the scenarios. The tables are organized by dataset and number of patients. For the analysis, the full window sizes and the aggregation method code_count were used. Additionally, we use a budget of 10 minutes for running our tests given that for such a small number of patients (10, 100, and 500 patients) data should be processed quickly. Note that `catabra-mem` is omitted from the tables as it was never completed within the 10-minute budget. - -### eICU Dataset - -The only method that was able to tabularize eICU data was MEDS-Tab. We ran our method with both 100 and 500 patients, resulting in an increment of three times in the number of codes. MEDS-Tab gave efficient results in terms of both time and memory usage. - -a) 100 Patients - -**Table 1: 6,374 Codes, 2,065,608 Rows, Output Shape \[132,461, 6,374\]** - -| Wall Time | Avg Memory | Peak Memory | Output Size | Method | -| --------- | ---------- | ----------- | ----------- | -------- | -| 0m39s | 5,271 MB | 14,791 MB | 362 MB | meds_tab | - -b) 500 Patients - -**Table 2: 18,314 Codes, 8,737,355 Rows, Output Shape \[565,014, 18,314\]** - -| Wall Time | Avg Memory | Peak Memory | Output Size | Method | -| --------- | ---------- | ----------- | ----------- | -------- | -| 3m4s | 8,335 MB | 15,102 MB | 1,326 MB | meds_tab | - -### MIMIC-IV Dataset - -MEDS-Tab, `tsfresh`, and `catabra` were tested across three different patient scales on MIMIC-IV. - -a) 10 Patients - -This table illustrates the efficiency of MEDS-Tab in processing a small subset of patients with extremely low computational cost and high data throughput, outperforming `tsfresh` and `catabra` in terms of both time and memory efficiency. 
- -**Table 3: 1,504 Codes, 23,346 Rows, Output Shape \[2,127, 1,504\]** - -| Wall Time | Avg Memory | Peak Memory | Output Size | Method | -| --------- | ---------- | ----------- | ----------- | -------- | -| 0m2s | 423 MB | 943 MB | 7 MB | meds_tab | -| 1m41s | 84,159 MB | 265,877 MB | 1 MB | tsfresh | -| 0m15s | 2,537 MB | 4,781 MB | 1 MB | catabra | - -b) 100 Patients - -The performance gap was further highlighted with an increased number of patients and codes. For a moderate patient count, MEDS-Tab demonstrated superior performance with significantly lower wall times and memory usage compared to `tsfresh` and `catabra`. - -**Table 4: 4,154 Codes, 150,789 Rows, Output Shape \[15,664, 4,154\]** - -| Wall Time | Avg Memory | Peak Memory | Output Size | Method | -| --------- | ---------- | ----------- | ----------- | -------- | -| 0m5s | 718 MB | 1,167 MB | 45 MB | meds_tab | -| 5m9s | 217,477 MB | 659,735 MB | 4 MB | tsfresh | -| 3m17s | 14,319 MB | 28,342 MB | 4 MB | catabra | - -c) 500 Patients - -Scaling further to 500 patients, MEDS-Tab maintained consistent performance, reinforcing its capability to manage large datasets efficiently. Because of the set time limit of 10 minutes, we could not get results for `catabra` and `tsfresh`. In comparison, MEDS-Tab processed the data in about 15 seconds, making it at least 40 times faster for the given patient scale. - -**Table 5: 48,115 Codes, 795,368 Rows, Output Shape \[75,595, 8,115\]** - -| Wall Time | Avg Memory | Peak Memory | Output Size | Method | -| --------- | ---------- | ----------- | ----------- | -------- | -| 0m16s | 1,410 MB | 3,539 MB | 442 MB | meds_tab | - -______________________________________________________________________ - -# Prediction Performance - -## XGBoost Model Performance on MIMIC-IV Tasks - -Evaluating our tabularization approach for baseline models involved training XGBoost across a spectrum of binary clinical prediction tasks, using data from the MIMIC-IV database. These tasks encompassed diverse outcomes such as mortality predictions over different intervals, readmission predictions, and lengths of stay (LOS) in both ICU and hospital settings. - -Each task is characterized by its specific label and prediction time. For instance, predicting "30-day readmission" involves assessing whether a patient returns to the hospital within 30 days, with predictions made at the time of discharge. This allows input features to be derived from the entire duration of the patient's admission. In contrast, tasks like "In ICU Mortality" focus on predicting the occurrence of death using only data from the first 24 or 48 hours of ICU admission. Specifically, we use the terminology "Index Timestamp" to mean the timestamp such that no event included as input will occur later than this point. - -We optimize predictive accuracy and model performance by using varied window sizes and aggregations of patient data. This approach allows us to effectively capture and leverage the temporal dynamics and clinical nuances inherent in each prediction task. - -### 1. XGBoost Time and Memory Profiling on MIMIC-IV - -A single XGBoost run was completed to profile time and memory usage. 
This was done for each `$TASK` using the following command: - -```console -meds-tab-xgboost - input_dir="path_to_data" \ - task_name=$TASK \ - output_dir="output_directory" \ - do_overwrite=False \ -``` - -This uses the default minimum code inclusion frequency, window sizes, and aggregations from the `launch_xgboost.yaml`: - -```yaml -allowed_codes: # allows all codes that meet min code inclusion frequency -min_code_inclusion_frequency: 10 -window_sizes: - - 1d - - 7d - - 30d - - 365d - - full -aggs: - - static/present - - static/first - - code/count - - value/count - - value/sum - - value/sum_sqd - - value/min - - value/max -``` - -Since this includes every window size and aggregation, it is the most expensive to run. The runtimes and memory usage are reported below. - -#### 1.1 XGBoost Runtimes and Memory Usage on MIMIC-IV Tasks - -| Task | Index Timestamp | Real Time | User Time | Sys Time | Avg Memory (MiB) | Peak Memory (MiB) | -| ------------------------------- | ----------------- | --------- | --------- | -------- | ---------------- | ----------------- | -| Post-discharge 30 day Mortality | Discharge | 2m59s | 3m38s | 0m38s | 9,037 | 11,955 | -| Post-discharge 1 year Mortality | Discharge | 5m16s | 6m10s | 0m59s | 10,804 | 12,330 | -| 30 day Readmission | Discharge | 2m30s | 3m3s | 0m39s | 13,199 | 18,677 | -| In ICU Mortality | Admission + 24 hr | 0m38s | 1m3s | 0m13s | 1,712 | 2,986 | -| In ICU Mortality | Admission + 48 hr | 0m34s | 1m1s | 0m13s | 1,613 | 2,770 | -| In Hospital Mortality | Admission + 24 hr | 2m8s | 2m41s | 0m32s | 9,072 | 12,056 | -| In Hospital Mortality | Admission + 48 hr | 1m54s | 2m25s | 0m29s | 8,858 | 12,371 | -| LOS in ICU > 3 days | Admission + 24 hr | 2m3s | 2m37s | 0m28s | 4,650 | 5,715 | -| LOS in ICU > 3 days | Admission + 48 hr | 1m44s | 2m18s | 0m24s | 4,453 | 5,577 | -| LOS in Hospital > 3 days | Admission + 24 hr | 6m5s | 7m5s | 1m4s | 11,012 | 12,223 | -| LOS in Hospital > 3 days | Admission + 48 hr | 6m10s | 7m12s | 1m4s | 10,703 | 11,830 | - -#### 1.2 MIMIC-IV Task Specific Training Cohort Size - -To better understand the runtimes, we also report the task specific cohort size. - -| Task | Index Timestamp | Number of Patients | Number of Events | -| ------------------------------- | ----------------- | ------------------ | ---------------- | -| Post-discharge 30 day Mortality | Discharge | 149,014 | 356,398 | -| Post-discharge 1 year Mortality | Discharge | 149,014 | 356,398 | -| 30 day Readmission | Discharge | 17,418 | 377,785 | -| In ICU Mortality | Admission + 24 hr | 7,839 | 22,811 | -| In ICU Mortality | Admission + 48 hr | 6,750 | 20,802 | -| In Hospital Mortality | Admission + 24 hr | 51,340 | 338,614 | -| In Hospital Mortality | Admission + 48 hr | 47,231 | 348,289 | -| LOS in ICU > 3 days | Admission + 24 hr | 42,809 | 61,342 | -| LOS in ICU > 3 days | Admission + 48 hr | 42,805 | 61,327 | -| LOS in Hospital > 3 days | Admission + 24 hr | 152,126 | 360,208 | -| LOS in Hospital > 3 days | Admission + 48 hr | 152,120 | 359,020 | - -### 2. 
MIMIC-IV Sweep - -The XGBoost sweep was run using the following command for each `$TASK`: - -```console -meds-tab-xgboost --multirun \ - input_dir="path_to_data" \ - task_name=$TASK \ - output_dir="output_directory" \ - tabularization.window_sizes=$(generate-subsets [1d,30d,365d,full]) \ - do_overwrite=False \ - tabularization.aggs=$(generate-subsets [static/present,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]) -``` - -The model parameters were set to: - -```yaml -model: - booster: gbtree - device: cpu - nthread: 1 - tree_method: hist - objective: binary:logistic -``` - -The hydra sweeper swept over the parameters: - -```yaml -params: - model.eta: tag(log, interval(0.001, 1)) - model.lambda: tag(log, interval(0.001, 1)) - model.alpha: tag(log, interval(0.001, 1)) - model.subsample: interval(0.5, 1) - model.min_child_weight: interval(1e-2, 100) - model.max_depth: range(2, 16) - num_boost_round: range(100, 1000) - early_stopping_rounds: range(1, 10) - tabularization.min_code_inclusion_frequency: tag(log, range(10, 1000000)) -``` - -Note that the XGBoost command shown includes `tabularization.window_sizes` and ` tabularization.aggs` in the parameters to sweep over. - -For a complete example on MIMIC-IV and for all of our config files, see the [MIMIC-IV companion repository](https://github.com/mmcdermott/MEDS_TAB_MIMIC_IV). - -#### 2.1 XGBoost Performance on MIMIC-IV - -| Task | Index Timestamp | AUC | Minimum Code Inclusion Frequency | Number of Included Codes\* | Window Sizes | Aggregations | -| ------------------------------- | ----------------- | ----- | -------------------------------- | -------------------------- | ---------------------- | --------------------------------------------------------------------------- | -| Post-discharge 30 day Mortality | Discharge | 0.935 | 1,371 | 5,712 | \[7d,full\] | \[code/count,value/count,value/min,value/max\] | -| Post-discharge 1 year Mortality | Discharge | 0.898 | 289 | 10,048 | \[2h,12h,1d,30d,full\] | \[static/present,code/count,value/sum_sqd,value/min\] | -| 30 day Readmission | Discharge | 0.708 | 303 | 9,903 | \[30d,365d,full\] | \[code/count,value/count,value/sum,value/sum_sqd,value/max\] | -| In ICU Mortality | Admission + 24 hr | 0.661 | 7,059 | 3,037 | \[12h,full\] | \[static/present,code/count,value/sum,value/min,value/max\] | -| In ICU Mortality | Admission + 48 hr | 0.673 | 71 | 16,112 | \[1d,7d,full\] | \[static/present,code/count,value/sum,value/min,value/max\] | -| In Hospital Mortality | Admission + 24 hr | 0.812 | 43 | 18,989 | \[1d,full\] | \[static/present,code/count,value/sum,value/min,value/max\] | -| In Hospital Mortality | Admission + 48 hr | 0.810 | 678 | 7,433 | \[1d,full\] | \[static/present,code/count,value/count\] | -| LOS in ICU > 3 days | Admission + 24 hr | 0.946 | 30,443 | 1,624 | \[2h,7d,30d\] | \[static/present,code/count,value/count,value/sum,value/sum_sqd,value/max\] | -| LOS in ICU > 3 days | Admission + 48 hr | 0.967 | 2,864 | 4,332 | \[2h,7d,30d\] | \[code/count,value/sum_sqd,value/max\] | -| LOS in Hospital > 3 days | Admission + 24 hr | 0.943 | 94,633 | 912 | \[12h,1d,7d\] | \[code/count,value/count,value/sum_sqd\] | -| LOS in Hospital > 3 days | Admission + 48 hr | 0.945 | 30,880 | 1,619 | \[1d,7d,30d\] | \[code/count,value/sum,value/min,value/max\] | - -- Number of Included Codes is based on Minimum Code Inclusion Frequency -- we calculated the number of resulting codes that were above the minimum threshold and reported that. 
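As an aside, the `generate-subsets` helper invoked in the sweep commands above simply expands a list of options into every non-empty subset so that Hydra's multirun can sweep over them. A rough Python reimplementation of that behavior (illustrative, not the packaged code) is:

```python
from itertools import chain, combinations


def generate_subsets(options: list[str]) -> str:
    """Return every non-empty subset, comma-joined for a Hydra multirun sweep."""
    subsets = chain.from_iterable(
        combinations(options, r) for r in range(1, len(options) + 1)
    )
    return ",".join("[" + ",".join(s) + "]" for s in sorted(subsets))


print(generate_subsets(["1d", "30d", "365d", "full"]))
# -> [1d],[1d,30d],[1d,30d,365d],[1d,30d,365d,full],... (15 subsets in total)
```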
- -#### 2.2 XGBoost Optimal Found Model Parameters - -Additionally, the model parameters from the highest-performing run are reported below. - -| Task | Index Timestamp | Eta | Lambda | Alpha | Subsample | Minimum Child Weight | Number of Boosting Rounds | Early Stopping Rounds | Max Tree Depth | -| ------------------------------- | ----------------- | ----- | ------ | ----- | --------- | -------------------- | ------------------------- | --------------------- | -------------- | -| Post-discharge 30 day Mortality | Discharge | 0.006 | 0.032 | 0.374 | 0.572 | 53 | 703 | 9 | 16 | -| Post-discharge 1 year Mortality | Discharge | 0.009 | 0.086 | 0.343 | 0.899 | 76 | 858 | 9 | 11 | -| 30 day Readmission | Discharge | 0.006 | 0.359 | 0.374 | 0.673 | 53 | 712 | 9 | 16 | -| In ICU Mortality | Admission + 24 hr | 0.038 | 0.062 | 0.231 | 0.995 | 89 | 513 | 7 | 14 | -| In ICU Mortality (first 48h) | Admission + 48 hr | 0.044 | 0.041 | 0.289 | 0.961 | 91 | 484 | 5 | 14 | -| In Hospital Mortality | Admission + 24 hr | 0.028 | 0.013 | 0.011 | 0.567 | 11 | 454 | 6 | 9 | -| In Hospital Mortality | Admission + 48 hr | 0.011 | 0.060 | 0.179 | 0.964 | 84 | 631 | 7 | 13 | -| LOS in ICU > 3 days | Admission + 24 hr | 0.012 | 0.090 | 0.137 | 0.626 | 26 | 650 | 8 | 14 | -| LOS in ICU > 3 days | Admission + 48 hr | 0.012 | 0.049 | 0.200 | 0.960 | 84 | 615 | 7 | 13 | -| LOS in Hospital > 3 days | Admission + 24 hr | 0.008 | 0.067 | 0.255 | 0.989 | 90 | 526 | 5 | 14 | -| LOS in Hospital > 3 days | Admission + 48 hr | 0.001 | 0.030 | 0.028 | 0.967 | 9 | 538 | 8 | 7 | - -## XGBoost Model Performance on eICU Tasks - -### eICU Sweep - -The eICU sweep was conducted equivalently to the MIMIC-IV sweep. Please refer to the MIMIC-IV Sweep subsection above for details on the commands and sweep parameters. - -For more details about eICU-specific task generation and running, see the [eICU companion repository](https://github.com/mmcdermott/MEDS_TAB_EICU). - -#### 1. XGBoost Performance on eICU - -| Task | Index Timestamp | AUC | Minimum Code Inclusion Frequency | Window Sizes | Aggregations | -| ------------------------------- | ----------------- | ----- | -------------------------------- | ------------------------ | -------------------------------------------------------------- | -| Post-discharge 30 day Mortality | Discharge | 0.603 | 68,235 | \[12h,1d,full\] | \[code/count,value/sum_sqd,value/max\] | -| Post-discharge 1 year Mortality | Discharge | 0.875 | 3,280 | \[30d,365d\] | \[static/present,value/sum,value/sum_sqd,value/min,value/max\] | -| In Hospital Mortality | Admission + 24 hr | 0.855 | 335,912 | \[2h,7d,30d,365d,full\] | \[static/present,code/count,value/count,value/min,value/max\] | -| In Hospital Mortality | Admission + 48 hr | 0.570 | 89,121 | \[12h,1d,30d\] | \[code/count,value/count,value/min\] | -| LOS in ICU > 3 days | Admission + 24 hr | 0.783 | 7,881 | \[1d,30d,full\] | \[static/present,code/count,value/count,value/sum,value/max\] | -| LOS in ICU > 3 days | Admission + 48 hr | 0.757 | 1,719 | \[2h,12h,7d,30d,full\] | \[code/count,value/count,value/sum,value/sum_sqd,value/min\] | -| LOS in Hospital > 3 days | Admission + 24 hr | 0.864 | 160 | \[1d,30d,365d,full\] | \[static/present,code/count,value/min,value/max\] | -| LOS in Hospital > 3 days | Admission + 48 hr | 0.895 | 975 | \[12h,1d,30d,365d,full\] | \[code/count,value/count,value/sum,value/sum_sqd\] | - -#### 2. 
XGBoost Optimal Found Model Parameters - -| Task | Index Timestamp | Eta | Lambda | Alpha | Subsample | Minimum Child Weight | Number of Boosting Rounds | Early Stopping Rounds | Max Tree Depth | -| ------------------------------- | ----------------- | ----- | ------ | ----- | --------- | -------------------- | ------------------------- | --------------------- | -------------- | -| In Hospital Mortality | Admission + 24 hr | 0.043 | 0.001 | 0.343 | 0.879 | 13 | 574 | 9 | 14 | -| In Hospital Mortality | Admission + 48 hr | 0.002 | 0.002 | 0.303 | 0.725 | 0 | 939 | 9 | 12 | -| LOS in ICU > 3 days | Admission + 24 hr | 0.210 | 0.189 | 0.053 | 0.955 | 5 | 359 | 6 | 14 | -| LOS in ICU > 3 days | Admission + 48 hr | 0.340 | 0.393 | 0.004 | 0.900 | 6 | 394 | 10 | 13 | -| LOS in Hospital > 3 days | Admission + 24 hr | 0.026 | 0.238 | 0.033 | 0.940 | 46 | 909 | 5 | 11 | -| LOS in Hospital > 3 days | Admission + 48 hr | 0.100 | 0.590 | 0.015 | 0.914 | 58 | 499 | 10 | 9 | -| Post-discharge 30 day Mortality | Discharge | 0.003 | 0.0116 | 0.001 | 0.730 | 13 | 986 | 7 | 7 | -| Post-discharge 1 year Mortality | Discharge | 0.005 | 0.006 | 0.002 | 0.690 | 93 | 938 | 6 | 14 | - -#### 3. eICU Task Specific Training Cohort Size - -| Task | Index Timestamp | Number of Patients | Number of Events | -| ------------------------------- | ----------------- | ------------------ | ---------------- | -| Post-discharge 30 day Mortality | Discharge | 91,405 | 91,405 | -| Post-discharge 1 year Mortality | Discharge | 91,405 | 91,405 | -| In Hospital Mortality | Admission + 24 hr | 35,85 | 3,585 | -| In Hospital Mortality | Admission + 48 hr | 1,527 | 1,527 | -| LOS in ICU > 3 days | Admission + 24 hr | 12,672 | 14,004 | -| LOS in ICU > 3 days | Admission + 48 hr | 12,712 | 14,064 | -| LOS in Hospital > 3 days | Admission + 24 hr | 99,540 | 99,540 | -| LOS in Hospital > 3 days | Admission + 48 hr | 99,786 | 99,786 | diff --git a/docs/Makefile b/docs/Makefile deleted index d0c3cbf..0000000 --- a/docs/Makefile +++ /dev/null @@ -1,20 +0,0 @@ -# Minimal makefile for Sphinx documentation -# - -# You can set these variables from the command line, and also -# from the environment for the first two. -SPHINXOPTS ?= -SPHINXBUILD ?= sphinx-build -SOURCEDIR = source -BUILDDIR = build - -# Put it first so that "make" without argument is like "make help". -help: - @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) - -.PHONY: help Makefile - -# Catch-all target: route all unknown targets to Sphinx using the new -# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). -%: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/README.MD b/docs/README.MD new file mode 100644 index 0000000..2aa4eaf --- /dev/null +++ b/docs/README.MD @@ -0,0 +1,35 @@ +# Maintainer Guide + +## Testing + +To run tests, use the following commands. + +Run the fast tests (those that don't use a GPU) with: + +```bash +pytest -k "not slow" +``` + +Run all tests with: + +```bash +pytest +``` + +______________________________________________________________________ + +## Local Documentation Development + +This section explains how to edit documentation files in the `docs` directory.
+ +First, install the documentation dependencies: + +```bash +pip install -e .[docs] +``` + +Then run: + +```bash +mkdocs serve +``` diff --git a/docs/assets/dark_purple_meds_tab.png b/docs/assets/dark_purple_meds_tab.png deleted file mode 100644 index 9e890ba..0000000 Binary files a/docs/assets/dark_purple_meds_tab.png and /dev/null differ diff --git a/docs/assets/white_meds_tab.png b/docs/assets/light_logo.png similarity index 100% rename from docs/assets/white_meds_tab.png rename to docs/assets/light_logo.png diff --git a/docs/assets/light_purple_meds_tab.png b/docs/assets/light_purple_meds_tab.png deleted file mode 100644 index dc0b9be..0000000 Binary files a/docs/assets/light_purple_meds_tab.png and /dev/null differ diff --git a/docs/gen_ref_pages.py b/docs/gen_ref_pages.py new file mode 100644 index 0000000..11277d9 --- /dev/null +++ b/docs/gen_ref_pages.py @@ -0,0 +1,83 @@ +"""Generate the code reference pages and navigation.""" + +from pathlib import Path + +import mkdocs_gen_files + +api_nav = mkdocs_gen_files.Nav() +config_nav = mkdocs_gen_files.Nav() + +root = Path(__file__).parent.parent +src = root / "src" / "MEDS_tabular_automl" + + +def process_python_files(): + for path in sorted(src.rglob("*.py")): + # Skip the configs directory for API Reference + if "configs" in path.parts: + continue + + module_path = path.relative_to(src).with_suffix("") + doc_path = path.relative_to(src).with_suffix(".md") + full_doc_path = Path("reference/api") / doc_path + + parts = tuple(module_path.parts) + + if parts[-1] == "__main__": + continue + + md_file_lines = [] + + if parts[-1] == "__init__": + parts = parts[:-1] + doc_path = doc_path.with_name("index.md") + full_doc_path = full_doc_path.with_name("index.md") + + readme_path = path.parent / "README.md" + if readme_path.exists(): + md_file_lines.append(f'--8<-- "{readme_path.relative_to(root)}"') + + if parts: # Only add to navigation if parts is not empty + api_nav[parts] = doc_path.as_posix() + + ident = "MEDS_tabular_automl" + if parts: + ident += "."
+ ".".join(parts) + md_file_lines.append(f"::: {ident}") + + with mkdocs_gen_files.open(full_doc_path, "w") as fd: + fd.write("\n".join(md_file_lines)) + + mkdocs_gen_files.set_edit_path(full_doc_path, path.relative_to(root)) + + +def process_yaml_files(): + config_dir = src / "configs" + for path in sorted(config_dir.rglob("*.yaml")) + sorted(config_dir.rglob("*.yml")): + rel_path = path.relative_to(config_dir) + doc_path = rel_path.with_suffix(".md") + full_doc_path = Path("reference/config") / doc_path + + parts = tuple(rel_path.parts) + + config_nav[parts] = doc_path.as_posix() + + with mkdocs_gen_files.open(full_doc_path, "w") as fd: + fd.write(f"# {path.stem}\n\n") + fd.write("```yaml\n") + fd.write(path.read_text()) + fd.write("\n```\n") + + mkdocs_gen_files.set_edit_path(full_doc_path, path.relative_to(root)) + + +process_python_files() +process_yaml_files() + +with mkdocs_gen_files.open("reference/api/SUMMARY.md", "w") as nav_file: + nav_file.write("# API Reference\n\n") + nav_file.writelines(api_nav.build_literate_nav()) + +with mkdocs_gen_files.open("reference/config/SUMMARY.md", "w") as nav_file: + nav_file.write("# Config Reference\n\n") + nav_file.writelines(config_nav.build_literate_nav()) diff --git a/docs/generate.sh b/docs/generate.sh deleted index 087fac6..0000000 --- a/docs/generate.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/usr/bin/env bash -# bash generate.sh - -set -e - -rm -rf build -make html -cd build/html -python -m http.server diff --git a/docs/source/implementation.md b/docs/implementation.md similarity index 87% rename from docs/source/implementation.md rename to docs/implementation.md index e93186a..09425a5 100644 --- a/docs/source/implementation.md +++ b/docs/implementation.md @@ -29,12 +29,12 @@ This initial stage processes a pre-shareded dataset. We expect a structure as fo ... ``` -We then compute and store feature frequencies, crucial for determining which features are relevant for further analysis. +We then compute and store feature counts, crucial for determining which features are relevant for further analysis. **Detailed Workflow:** - **Data Loading and Sharding**: We iterate through shards to compute feature frequencies for each shard. -- **Frequency Aggregation**: After computing frequencies across shards, we aggregate them to get a final count of each feature across the entire dataset training dataset, which allows us to filter out infrequent features in the tabularization stage or when tuning XGBoost. +- **Count Aggregation**: After computing feature counts across shards, we aggregate them to get a final count of each feature across the entire training dataset, which allows us to filter out infrequent features in the tabularization stage or when tuning XGBoost. ## 2. Tabularization of Time-Series Data @@ -92,7 +92,7 @@ Now that we have generated tabular features for all the events in our dataset, w - **Row Selection Based on Tasks**: Only the data rows that are relevant to the specific tasks are selected and cached. This reduces the memory footprint and speeds up the training process. - **Use of Sparse Matrices for Efficient Storage**: Sparse matrices are again employed here to store the selected data efficiently, ensuring that only non-zero data points are kept in memory, thus optimizing both storage and retrieval times. -The file structure for the cached data mirrors that of the tabular data, also consisting of `.npz` files, where users must specify the directory that stores labels.
Labels follow the same shard filestructure as the input meds data from step (1), and the label parquets need `subject_id`, `timestamp`, and `label` columns. +The file structure for the cached data mirrors that of the tabular data, also consisting of `.npz` files, where users must specify the directory that stores labels. Labels must follow the [MEDS label-schema](https://github.com/Medical-Event-Data-Standard/meds?tab=readme-ov-file#the-label-schema), specifically including the `subject_id`, `prediction_time`, and `boolean_value` columns which are necessary for binary classification tasks. ## 4. XGBoost Training @@ -102,4 +102,4 @@ The final stage uses the processed and cached data to train an XGBoost model. Th - **Iterator for Data Loading**: Custom iterators are designed to load sparse matrices efficiently into the XGBoost training process, which can handle sparse inputs natively, thus maintaining high computational efficiency. - **Training and Validation**: The model is trained using the tabular data, with evaluation steps that include early stopping to prevent overfitting and tuning of hyperparameters based on validation performance. -- **Hyperaparameter Tuning**: We use [optuna](https://optuna.org/) to tune over XGBoost model pramters, aggregations, window sizes, and the minimimum code inclusion frequency. +- **Hyperparameter Tuning**: We use [optuna](https://optuna.org/) to tune over XGBoost model parameters, aggregations, window sizes, and the minimum code inclusion count. diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..612c7a5 --- /dev/null +++ b/docs/index.md @@ -0,0 +1 @@ +--8<-- "README.md" diff --git a/docs/javascripts/mathjax.js b/docs/javascripts/mathjax.js new file mode 100644 index 0000000..7e48906 --- /dev/null +++ b/docs/javascripts/mathjax.js @@ -0,0 +1,19 @@ +window.MathJax = { + tex: { + inlineMath: [["\\(", "\\)"]], + displayMath: [["\\[", "\\]"]], + processEscapes: true, + processEnvironments: true + }, + options: { + ignoreHtmlClass: ".*|", + processHtmlClass: "arithmatex" + } +}; + +document$.subscribe(() => { + MathJax.startup.output.clearCache() + MathJax.typesetClear() + MathJax.texReset() + MathJax.typesetPromise() +}) diff --git a/docs/make.bat b/docs/make.bat deleted index 319c288..0000000 --- a/docs/make.bat +++ /dev/null @@ -1,35 +0,0 @@ -@ECHO OFF - -pushd %~dp0 - -REM Command file for Sphinx documentation - -if "%SPHINXBUILD%" == "" ( - set SPHINXBUILD=sphinx-build -) -set SOURCEDIR=source -set BUILDDIR=build - -if "%1" == "" goto help - -%SPHINXBUILD% >NUL 2>NUL -if errorlevel 9009 ( - echo. - echo.The 'sphinx-build' command was not found. Make sure you have Sphinx - echo.installed, then set the SPHINXBUILD environment variable to point - echo.to the full path of the 'sphinx-build' executable. Alternatively you - echo.may add the Sphinx directory to PATH. - echo. - echo.If you don't have Sphinx installed, grab it from - echo.http://sphinx-doc.org/ - exit /b 1 -) - -%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% -goto end - -:help -%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% - -:end -popd diff --git a/docs/overrides/main.html b/docs/overrides/main.html new file mode 100644 index 0000000..66b1270 --- /dev/null +++ b/docs/overrides/main.html @@ -0,0 +1,5 @@ +{% extends "base.html" %} + +{% block announce %} +This is a community-supported tool. If you'd like to contribute, check out our GitHub repository. Your contributions are welcome! +{% endblock %}
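The implementation notes above mention custom iterators that stream sparse matrices into XGBoost, which accepts sparse inputs natively. The sketch below shows the general shape of that pattern using XGBoost's public `DataIter` interface; the shard file names and label layout here are hypothetical, and MEDS-Tab's actual iterator differs in its details:

```python
import numpy as np
import scipy.sparse as sp
import xgboost as xgb


class ShardIter(xgb.DataIter):
    """Feed sparse feature shards to XGBoost one shard at a time."""

    def __init__(self, shard_paths: list[str]):
        self._paths = shard_paths
        self._i = 0
        super().__init__()

    def next(self, input_data) -> int:
        if self._i == len(self._paths):
            return 0  # no shards left; ends this pass over the data
        X = sp.load_npz(self._paths[self._i])  # hypothetical sparse shard file
        y = np.load(self._paths[self._i] + ".labels.npy")  # hypothetical labels
        input_data(data=X, label=y)
        self._i += 1
        return 1

    def reset(self) -> None:
        self._i = 0


# Usage sketch: XGBoost consumes the iterator like any other data source.
# dtrain = xgb.DMatrix(ShardIter(["shard0.npz", "shard1.npz"]))
# booster = xgb.train({"objective": "binary:logistic", "tree_method": "hist"}, dtrain)
```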
diff --git a/docs/overview.md b/docs/overview.md new file mode 100644 index 0000000..ffbb7fb --- /dev/null +++ b/docs/overview.md @@ -0,0 +1,276 @@ + +# Core CLI Scripts Overview +We provide a set of core CLI scripts to facilitate the tabularization and modeling of MEDS data. These scripts are designed to be run in sequence to transform raw MEDS data into tabularized data and train a model on the tabularized data. The following is a high-level overview of the core CLI scripts: + +#### 1. **`MEDS_transform-reshard_to_split`**: + +This optional command reshards the data. A core challenge in tabularization is the high memory usage and slow compute time. We shard the data into small shards to reduce the memory usage, as we can independently tabularize each shard, and we can reduce CPU time by parallelizing the processing of these shards across workers that independently process different shards. + +```bash +MEDS_transform-reshard_to_split \ + --multirun \ + worker="range(0,6)" \ + hydra/launcher=joblib \ + input_dir="$MEDS_DIR" \ + cohort_dir="$MEDS_RESHARD_DIR" \ + 'stages=["reshard_to_split"]' \ + stage="reshard_to_split" \ + stage_configs.reshard_to_split.n_subjects_per_shard=2500 +``` + +??? note "Args Description" + + - `--multirun`: This is an optional argument to specify that the command should be run in parallel. We use this here to parallelize the resharding of the data. + - `hydra/launcher`: This is an optional argument to specify the launcher. When using multirun you should specify the launcher. We use joblib here which enables parallelization on a single machine. + - `worker`: When using joblib or a hydra slurm launcher, the range of workers must be defined as it specifies the number of parallel workers to spawn. We use 6 workers here. + - `input_dir`: The directory containing the MEDS data. + - `cohort_dir`: The directory to store the resharded data. + - `stages`: The stages to run. We only run the reshard_to_split stage here. MEDS Transform allows for a sequence of stages to be defined and run, which is why this is a list. + - `stage`: The specific stage to run. We run the reshard_to_split stage here. It must be one of the stages in the `stages` kwarg list. + - `stage_configs.reshard_to_split.n_subjects_per_shard`: The number of subjects per shard. We use 2500 subjects per shard here. + +For the rest of the tutorial we will assume that the data has been resharded into the `MEDS_RESHARD_DIR` directory, but this step is optional, and you could instead use the original data directory, `MEDS_DIR`. If you experience memory issues in later stages, you should try reducing `stage_configs.reshard_to_split.n_subjects_per_shard` to a smaller number. + + +#### 2. **`meds-tab-describe`**: + +This command processes MEDS data shards to compute the frequencies of different code types. It differentiates codes into the following categories: + + +* dynamic codes (codes with timestamps) +* dynamic numeric values (codes with timestamps and numerical values) +* static codes (codes without timestamps) +* static numeric values (codes without timestamps but with numerical values). + + This script further caches feature names and frequencies in a dataset stored in a `code_metadata.parquet` file within the `input_dir` argument specified as a hydra-style command line argument. + +```bash +meds-tab-describe \ + "input_dir=${MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" +``` +This stage is not parallelized as it runs very quickly. + +??? note "Args Description" + + - `input_dir`: The directory containing the MEDS data. + - `output_dir`: The directory to store the tabularized data.
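Since the later stages filter codes by how often they occur, it can help to inspect the describe stage's output before choosing an inclusion threshold. A hedged sketch of doing so with Polars (the exact file location and column names may differ in your setup):

```python
import polars as pl

# Both the path and the "count" column name are assumptions for illustration;
# check the code_metadata.parquet file your own describe run produced.
metadata = pl.read_parquet("OUTPUT_TABULARIZATION_DIR/code_metadata.parquet")
print(metadata.columns)

# How many codes would survive a given inclusion threshold?
for threshold in (10, 100, 1000):
    n_codes = metadata.filter(pl.col("count") >= threshold).height
    print(f"codes appearing >= {threshold} times: {n_codes}")
```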
#### 3. **`meds-tab-tabularize-static`**: Filters and processes the dataset based on the count of codes, generating a tabular vector for each patient at each timestamp in the shards. Each row corresponds to a unique `subject_id` and `timestamp` combination, thus rows are duplicated across multiple timestamps for the same patient. + + **Example: Tabularizing static data** with a minimum code count of 10, window sizes of `[1d, 30d, 365d, full]`, and value aggregation methods of `[static/present, static/first, code/count, value/count, value/sum, value/sum_sqd, value/min, value/max]` + + ```console + meds-tab-tabularize-static input_dir="path_to_data" \ + tabularization.min_code_inclusion_count=10 \ + tabularization.window_sizes=[1d,30d,365d,full] \ + do_overwrite=False \ + tabularization.aggs=[static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max] + ``` + + - For the exhaustive examples of value aggregations, see [`/src/MEDS_tabular_automl/utils.py`](https://github.com/mmcdermott/MEDS_Tabular_AutoML/blob/main/src/MEDS_tabular_automl/utils.py#L24) + +!!! note + + In addition to `min_code_inclusion_count` there are several other parameters that can be set during tabularization to restrict the codes that are included in the tabularized data. These are: + * `allowed_codes`: a list of codes to include in the tabularized data + * `min_code_inclusion_count`: The minimum number of times a code must appear in the data to be included in the tabularized data + * `min_code_inclusion_frequency`: The minimum normalized frequency (i.e. normalized by dividing the code's count by the total number of observations across all codes in the dataset) required for a code to be included. + * `max_included_codes`: The maximum number of codes to include in the tabularized data + + +```bash +meds-tab-tabularize-static \ + "input_dir=${MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" +``` + +This stage is not parallelized as it runs very quickly. +??? note "Args Description" + - `input_dir`: The directory containing the MEDS data. + - `output_dir`: The directory to store the tabularized data. + + +#### 4. **`meds-tab-tabularize-time-series`**: + +Iterates through combinations of a shard, `window_size`, and `aggregation` to generate feature vectors that aggregate patient data for each unique `subject_id` x `time`. This stage (and the previous stage) uses sparse matrix formats to efficiently handle the computational and storage demands of rolling window calculations on large datasets. We support parallelization through Hydra's [`--multirun`](https://hydra.cc/docs/intro/#multirun) flag and the [`joblib` launcher](https://hydra.cc/docs/plugins/joblib_launcher/#internaldocs-banner). + + **Example: Aggregate time-series data** on features across different `window_sizes` + + +```bash +meds-tab-tabularize-time-series \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=joblib \ + "input_dir=${MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" \ + tabularization.min_code_inclusion_count=10 \ + tabularization.window_sizes=[1d,30d,365d,full] \ + tabularization.aggs=[static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max] +``` + +!!! warning + + This stage is the most memory intensive stage!
+#### 4. **`meds-tab-tabularize-time-series`**:
+
+Iterates through combinations of a shard, `window_size`, and `aggregation` to generate feature vectors that aggregate patient data for each unique `subject_id` x `time`. This stage (and the previous stage) uses sparse matrix formats to efficiently handle the computational and storage demands of rolling window calculations on large datasets. We support parallelization through Hydra's [`--multirun`](https://hydra.cc/docs/intro/#multirun) flag and the [`joblib` launcher](https://hydra.cc/docs/plugins/joblib_launcher/#internaldocs-banner).
+
+**Example: Aggregate time-series data** on features across different `window_sizes`:
+
+```bash
+meds-tab-tabularize-time-series \
+    --multirun \
+    worker="range(0,$N_PARALLEL_WORKERS)" \
+    hydra/launcher=joblib \
+    "input_dir=${MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" \
+    tabularization.min_code_inclusion_count=10 \
+    tabularization.window_sizes=[1d,30d,365d,full] \
+    tabularization.aggs=[static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]
+```
+
+!!! warning
+
+    This is the most memory-intensive stage, and it should be parallelized to speed up processing. If you run out of memory, either reduce the number of workers or reshard your data with `MEDS_transform-reshard_to_split`, setting `stage_configs.reshard_to_split.n_subjects_per_shard` to a smaller number. This is also one of the slowest stages.
+
+!!! warning
+
+    You must use the same code inclusion parameters (which in this example is just `tabularization.min_code_inclusion_count`) as in the previous stage, `meds-tab-tabularize-static`, to ensure that the same codes are included in the tabularized data.
+
+??? note "Args Description"
+
+    - `--multirun`: An optional argument specifying that the command should be run in parallel. We use it here to parallelize the processing of the shards.
+    - `hydra/launcher`: An optional argument specifying the launcher. When using `--multirun` you should specify the launcher. We use joblib here, which enables parallelization on a single machine.
+    - `worker`: When using joblib or a Hydra Slurm launcher, the range of workers must be defined, as it specifies the number of parallel workers to spawn. We use `$N_PARALLEL_WORKERS` workers here.
+    - `input_dir`: The directory containing the MEDS data.
+    - `output_dir`: The directory to store the tabularized data.
+    - `tabularization.min_code_inclusion_count`: The minimum code inclusion count. We use 10 here, so only codes that appear at least 10 times in the data will be included.
+    - `tabularization.window_sizes`: The window sizes to use. We use `[1d,30d,365d,full]` here. This means we will generate features for the last day, last 30 days, last 365 days, and the full history of the patient.
+    - `tabularization.aggs`: The aggregation functions to use. We use `[static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]` here. This means we will generate features for the presence of a static code, the value of a static code, the count of dynamic codes, the count of dynamic values, the sum of dynamic values, the sum of squared dynamic values, the minimum dynamic value, and the maximum dynamic value.
+
+#### 5. **`meds-tab-cache-task`**:
+
+Aligns task-specific labels with the nearest prior event in the tabularized data. It requires a labeled dataset directory with three columns (`subject_id`, `timestamp`, `label`) structured similarly to the `input_dir`.
+
+**Example: Align tabularized data** for a specific task `$TASK`, with labels pulled from [ACES](https://github.com/justin13601/ACES):
+
+```console
+meds-tab-cache-task input_dir="path_to_data" \
+    task_name=$TASK \
+    tabularization.min_code_inclusion_count=10 \
+    tabularization.window_sizes=[1d,30d,365d,full] \
+    tabularization.aggs=[static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]
+```
+
+```bash
+meds-tab-cache-task \
+    --multirun \
+    hydra/launcher=joblib \
+    worker="range(0,$N_PARALLEL_WORKERS)" \
+    "input_dir=${MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" \
+    "input_label_dir=${TASKS_DIR}/${TASK}/" "task_name=${TASK}" \
+    tabularization.min_code_inclusion_count=10 \
+    tabularization.window_sizes=[1d,30d,365d,full] \
+    tabularization.aggs=[static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]
+```
+
+!!! warning
+
+    This is the slowest stage, but it should not be as memory intensive, so make sure to parallelize across as many workers as possible.
+
+!!! warning
+
+    You must use the same code inclusion parameters (which in this example is just `tabularization.min_code_inclusion_count`) as in the previous stages, `meds-tab-tabularize-static` and `meds-tab-tabularize-time-series`, to ensure that the same codes are included in the tabularized data.
+
+??? note "Args Description"
+
+    - `--multirun`: An optional argument specifying that the command should be run in parallel. We use it here to parallelize the label caching across shards.
+    - `hydra/launcher`: An optional argument specifying the launcher. When using `--multirun` you should specify the launcher. We use joblib here, which enables parallelization on a single machine.
+    - `worker`: When using joblib or a Hydra Slurm launcher, the range of workers must be defined, as it specifies the number of parallel workers to spawn. We use `$N_PARALLEL_WORKERS` workers here.
+    - `input_dir`: The directory containing the MEDS data.
+    - `output_dir`: The directory to store the tabularized data.
+    - `input_label_dir`: The directory containing the labels (following the [meds label-schema](https://github.com/Medical-Event-Data-Standard/meds?tab=readme-ov-file#the-label-schema)) for the task.
+    - `task_name`: The name of the task to cache the labels for.
+    - `tabularization.min_code_inclusion_count`: The minimum code inclusion count.
+    - `tabularization.window_sizes`: The window sizes to use.
+    - `tabularization.aggs`: The aggregation functions to use.
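+
+The labels themselves are just a three-column table. Below is a minimal sketch of writing a toy label file with `polars` (column names per the description above; consult the MEDS label schema linked above for the authoritative column types, and note that the subject IDs, times, and file name here are purely illustrative):
+
+```bash
+# Sketch: write a toy Parquet label file for task $TASK. Place it where
+# `input_label_dir` points, e.g. under ${TASKS_DIR}/${TASK}/.
+python -c "
+import polars as pl
+from datetime import datetime
+pl.DataFrame({
+    'subject_id': [1, 2],
+    'timestamp': [datetime(2019, 1, 1), datetime(2020, 6, 1)],
+    'label': [0, 1],
+}).write_parquet('labels.parquet')
+"
+```
+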
+#### 6. **`meds-tab-model`**:
+
+Trains a tabular model using user-specified parameters. You can train a single XGBoost model with the following command:
+
+```bash
+meds-tab-model \
+    model_launcher=xgboost \
+    "input_dir=${MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" \
+    "output_model_dir=${OUTPUT_MODEL_DIR}/${TASK}/" "task_name=$TASK" \
+    tabularization.min_code_inclusion_count=10 \
+    "tabularization.window_sizes=[1d,30d,365d,full]" \
+    "tabularization.aggs=[static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]"
+```
+
+??? note "Args Description"
+
+    - `model_launcher`: The launcher to use for the model. Choose one of `xgboost`, `knn_classifier`, `logistic_regression`, `random_forest_classifier`, `sgd_classifier`.
+    - `input_dir`: The directory containing the MEDS data.
+    - `output_dir`: The directory to store the tabularized data.
+    - `output_model_dir`: The directory to store the model.
+    - `task_name`: The name of the task to train the model on.
+    - `tabularization.min_code_inclusion_count`: The minimum code inclusion count.
+    - `tabularization.window_sizes`: The window sizes to use.
+    - `tabularization.aggs`: The aggregation functions to use.
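+
+The same interface works with the other model launchers listed above. A sketch swapping in a different launcher, with all other arguments unchanged:
+
+```bash
+# Sketch: identical invocation, but with a scikit-learn-style launcher
+# from the list above instead of XGBoost.
+meds-tab-model \
+    model_launcher=logistic_regression \
+    "input_dir=${MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" \
+    "output_model_dir=${OUTPUT_MODEL_DIR}/${TASK}/" "task_name=$TASK" \
+    tabularization.min_code_inclusion_count=10 \
+    "tabularization.window_sizes=[1d,30d,365d,full]" \
+    "tabularization.aggs=[static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]"
+```
+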
+
+You can also run an [Optuna](https://optuna.org/) hyperparameter sweep by adding the `--multirun` flag; you can control the number of trials with `hydra.sweeper.n_trials` and the number of parallel jobs with `hydra.sweeper.n_jobs`:
+
+```bash
+meds-tab-model \
+    --multirun \
+    model_launcher=xgboost \
+    "input_dir=${MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" \
+    "output_model_dir=${OUTPUT_MODEL_DIR}/${TASK}/" "task_name=$TASK" \
+    "hydra.sweeper.n_trials=1000" "hydra.sweeper.n_jobs=${N_PARALLEL_WORKERS}" \
+    tabularization.min_code_inclusion_count=10 \
+    tabularization.window_sizes=$(generate-subsets [1d,30d,365d,full]) \
+    tabularization.aggs=$(generate-subsets [static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max])
+```
+
+??? note "Args Description"
+
+    - `--multirun`: Required when sweeping; it specifies that we are performing a hyperparameter sweep using Optuna.
+    - `model_launcher`: The launcher to use for the model. Choose one of `xgboost`, `knn_classifier`, `logistic_regression`, `random_forest_classifier`, `sgd_classifier`.
+    - `input_dir`: The directory containing the MEDS data.
+    - `output_dir`: The directory to store the tabularized data.
+    - `output_model_dir`: The directory to store the model.
+    - `hydra.sweeper.n_trials`: The number of trials to run in the hyperparameter sweep.
+    - `hydra.sweeper.n_jobs`: The number of parallel jobs to run in the hyperparameter sweep.
+    - `task_name`: The name of the task to train the model on.
+    - `tabularization.min_code_inclusion_count`: The minimum code inclusion count.
+    - `tabularization.window_sizes`: The window sizes to use.
+    - `tabularization.aggs`: The aggregation functions to use.
+
+??? note "Why `generate-subsets`?"
+
+    **`generate-subsets`**: Generates and prints a sorted list of all non-empty subsets from a comma-separated input. This is provided for the convenience of sweeping over all possible combinations of window sizes and aggregations.
+
+    For example, you can call **`generate-subsets`** directly in the command line:
+
+    ```console
+    generate-subsets [2,3,4]
+    [2], [2, 3], [2, 3, 4], [2, 4], [3], [3, 4], [4]
+    ```
+
+    This can be used in concert with other calls. For example, the following call:
+
+    ```console
+    meds-tab-model --multirun tabularization.window_sizes=$(generate-subsets [1d,2d,7d,full])
+    ```
+
+    would resolve to:
+
+    ```console
+    meds-tab-model --multirun tabularization.window_sizes=[1d],[1d,2d],[1d,2d,7d],[1d,2d,7d,full],[1d,2d,full],[1d,7d],[1d,7d,full],[1d,full],[2d],[2d,7d],[2d,7d,full],[2d,full],[7d],[7d,full],[full]
+    ```
+
+    which can then be correctly interpreted by Hydra's multirun logic to sweep over all possible combinations of window sizes during hyperparameter tuning!
+
+!!! note "Code Inclusion Parameters"
+
+    In this modeling stage, you can change the code inclusion parameters from the previous tabularization and task caching stages and treat them as tunable hyperparameters (see the sketch below).
+
+    In addition to the previously defined code inclusion parameters, there are two others that we allow only in modeling (as they are task specific):
+
+    * `min_correlation`: The minimum correlation a code must have with the target to be included in the tabularized data.
+    * `max_by_correlation`: The maximum number of codes to include in the tabularized data based on correlation with the target. Specifically, we sort the codes by correlation with the target and include the top `max_by_correlation` codes.
+
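+A minimal sketch using one of these task-specific filters. Note that the `tabularization.` prefix for these keys is an assumption here (mirroring the other inclusion parameters); run `meds-tab-model --help` to confirm the exact config path before copying:
+
+```bash
+# Sketch: cap the feature set at the 1000 codes most correlated with the
+# target during modeling. The config prefix is assumed, not confirmed.
+meds-tab-model \
+    model_launcher=xgboost \
+    "input_dir=${MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" \
+    "output_model_dir=${OUTPUT_MODEL_DIR}/${TASK}/" "task_name=$TASK" \
+    tabularization.max_by_correlation=1000
+```
+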
+??? example "Experimental Feature"
+
+    We also support an AutoGluon-based hyperparameter and model search:
+
+    ```bash
+    meds-tab-autogluon model_launcher=autogluon \
+        "input_dir=${MEDS_RESHARD_DIR}/data" "output_dir=$OUTPUT_TABULARIZATION_DIR" \
+        "output_model_dir=${OUTPUT_MODEL_DIR}/${TASK}/" "task_name=$TASK"
+    ```
+
+    Run `meds-tab-autogluon model_launcher=autogluon --help` to see all kwargs. AutoGluon requires a lot of memory, as it makes all the sparse matrices dense, and is not recommended for large datasets.
diff --git a/docs/source/prediction.md b/docs/prediction.md
similarity index 95%
rename from docs/source/prediction.md
rename to docs/prediction.md
index 18f19c0..84545bb 100644
--- a/docs/source/prediction.md
+++ b/docs/prediction.md
@@ -13,18 +13,18 @@ We optimize predictive accuracy and model performance by using varied window siz
 A single XGBoost run was completed to profile time and memory usage. This was done for each `$TASK` using the following command:
 
 ```console
-meds-tab-xgboost
+meds-tab-model \
     input_dir="path_to_data" \
     task_name=$TASK \
     output_dir="output_directory" \
     do_overwrite=False \
 ```
 
-This uses the defaults minimum code inclusion frequency, window sizes, and aggregations from the `launch_xgboost.yaml`:
+This uses the default minimum code inclusion count, window sizes, and aggregations from the `launch_xgboost.yaml`:
 
 ```yaml
-allowed_codes: # allows all codes that meet min code inclusion frequency
-min_code_inclusion_frequency: 10
+allowed_codes: # allows all codes that meet min code inclusion count
+min_code_inclusion_count: 10
 window_sizes:
   - 1d
   - 7d
@@ -83,7 +83,7 @@ To better understand the runtimes, we also report the task specific cohort size
 The XGBoost sweep was run using the following command for each `$TASK`:
 
 ```console
-meds-tab-xgboost --multirun \
+meds-tab-model --multirun \
     input_dir="path_to_data" \
     task_name=$TASK \
     output_dir="output_directory" \
@@ -115,7 +115,7 @@ params:
     model.max_depth: range(2, 16)
     num_boost_round: range(100, 1000)
    early_stopping_rounds: range(1, 10)
-    tabularization.min_code_inclusion_frequency: tag(log, range(10, 1000000))
+    tabularization.min_code_inclusion_count: tag(log, range(10, 1000000))
 ```
 
 Note that the XGBoost command shown includes `tabularization.window_sizes` and ` tabularization.aggs` in the parameters to sweep over.
@@ -124,7 +124,7 @@ For a complete example on MIMIC-IV and for all of our config files, see the [MIM #### 2.1 XGBoost Performance on MIMIC-IV -| Task | Index Timestamp | AUC | Minimum Code Inclusion Frequency | Number of Included Codes\* | Window Sizes | Aggregations | +| Task | Index Timestamp | AUC | Minimum Code Inclusion Count | Number of Included Codes\* | Window Sizes | Aggregations | | ------------------------------- | ----------------- | ----- | -------------------------------- | -------------------------- | ---------------------- | --------------------------------------------------------------------------- | | Post-discharge 30 day Mortality | Discharge | 0.935 | 1,371 | 5,712 | \[7d,full\] | \[code/count,value/count,value/min,value/max\] | | Post-discharge 1 year Mortality | Discharge | 0.898 | 289 | 10,048 | \[2h,12h,1d,30d,full\] | \[static/present,code/count,value/sum_sqd,value/min\] | @@ -138,7 +138,7 @@ For a complete example on MIMIC-IV and for all of our config files, see the [MIM | LOS in Hospital > 3 days | Admission + 24 hr | 0.943 | 94,633 | 912 | \[12h,1d,7d\] | \[code/count,value/count,value/sum_sqd\] | | LOS in Hospital > 3 days | Admission + 48 hr | 0.945 | 30,880 | 1,619 | \[1d,7d,30d\] | \[code/count,value/sum,value/min,value/max\] | -- Number of Included Codes is based on Minimum Code Inclusion Frequency -- we calculated the number of resulting codes that were above the minimum threshold and reported that. +- Number of Included Codes is based on Minimum Code Inclusion Count -- we calculated the number of resulting codes that were above the minimum threshold and reported that. #### 2.2 XGBoost Optimal Found Model Parameters @@ -168,7 +168,7 @@ For more details about eICU specific task generation and running, see the [eICU #### 1. 
XGBoost Performance on eICU -| Task | Index Timestamp | AUC | Minimum Code Inclusion Frequency | Window Sizes | Aggregations | +| Task | Index Timestamp | AUC | Minimum Code Inclusion Count | Window Sizes | Aggregations | | ------------------------------- | ----------------- | ----- | -------------------------------- | ------------------------ | -------------------------------------------------------------- | | Post-discharge 30 day Mortality | Discharge | 0.603 | 68,235 | \[12h,1d,full\] | \[code/count,value/sum_sqd,value/max\] | | Post-discharge 1 year Mortality | Discharge | 0.875 | 3,280 | \[30d,365d\] | \[static/present,value/sum,value/sum_sqd,value/min,value/max\] | diff --git a/docs/source/profiling.md b/docs/profiling.md similarity index 100% rename from docs/source/profiling.md rename to docs/profiling.md diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index 7f214a7..0000000 --- a/docs/requirements.txt +++ /dev/null @@ -1,14 +0,0 @@ -sphinx==7.1.2 -sphinx-rtd-theme==1.3.0rc1 -sphinx-collections -sphinx_immaterial -sphinx_subfigure -nbsphinx -myst_parser -pypandoc -linkify-it-py -omegaconf -ipywidgets -ipykernel -ipython -pydata-sphinx-theme diff --git a/docs/source/_static/switcher.json b/docs/source/_static/switcher.json deleted file mode 100644 index 6f20b9d..0000000 --- a/docs/source/_static/switcher.json +++ /dev/null @@ -1,12 +0,0 @@ -[ - { - "version": "dev", - "url": "https://meds-tab.readthedocs.io/en/latest/" - }, - { - "name": "0.0.4 (stable)", - "version": "v0.0.4", - "url": "https://meds-tab.readthedocs.io/en/stable/", - "preferred": true - } - ] diff --git a/docs/source/conf.py b/docs/source/conf.py deleted file mode 100644 index bac8f33..0000000 --- a/docs/source/conf.py +++ /dev/null @@ -1,398 +0,0 @@ -import os -import shutil -import sys -from pathlib import Path - -import MEDS_tabular_automl - -# Configuration file for the Sphinx documentation builder. -# -# For the full list of built-in configuration values, see the documentation: -# https://www.sphinx-doc.org/en/master/usage/configuration.html - -# -- Project information ----------------------------------------------------- -# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information - -project = "MEDS-Tab" -copyright = "2024, Nassim Oufattole, Matthew McDermott, Teya Bergamaschi, Aleksia Kolo, Hyewon Jeong" -author = "Nassim Oufattole, Matthew McDermott, Teya Bergamaschi, Aleksia Kolo, Hyewon Jeong" -# Define the json_url for our version switcher. - - -json_url = "https://meds-tab.readthedocs.io/en/latest/_static/switcher.json" -# Define the version we use for matching in the version switcher. -version_match = os.environ.get("READTHEDOCS_VERSION") -release = MEDS_tabular_automl.__version__ -# If READTHEDOCS_VERSION doesn't exist, we're not on RTD -# If it is an integer, we're in a PR build and the version isn't correct. -# If it's "latest" → change to "dev" (that's what we want the switcher to call it) -if not version_match or version_match.isdigit(): - # For local development, infer the version to match from the package. 
- if "dev" in release or "rc" in release: - version_match = "dev" - # We want to keep the relative reference if we are in dev mode - # but we want the whole url if we are effectively in a released version - json_url = "_static/switcher.json" - else: - version_match = f"v{release}" -elif version_match == "latest": - version_match = "dev" -elif version_match == "stable": - version_match = f"v{release}" - -# Configuration file for the Sphinx documentation builder. -# -# This file only contains a selection of the most common options. For a full -# list see the documentation: -# https://www.sphinx-doc.org/en/master/usage/configuration.html - -language = "en" -# -- Path setup - -__location__ = Path(os.path.dirname(__file__)) -__src__ = __location__ / "../.." - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -sys.path.insert(0, str(__src__)) - - -def ensure_pandoc_installed(_): - """Source: https://stackoverflow.com/questions/62398231/building-docs-fails-due-to-missing-pandoc""" - import pypandoc - - # Download pandoc if necessary. If pandoc is already installed and on - # the PATH, the installed version will be used. Otherwise, we will - # download a copy of pandoc into docs/bin/ and add that to our PATH. - pandoc_dir = str(__location__ / "bin") - # Add dir containing pandoc binary to the PATH environment variable - if pandoc_dir not in os.environ["PATH"].split(os.pathsep): - os.environ["PATH"] += os.pathsep + pandoc_dir - - pypandoc.ensure_pandoc_installed( - targetfolder=pandoc_dir, - delete_installer=True, - ) - - -# -- Run sphinx-apidoc -# This ensures we don't need to run apidoc manually. - -# TODO: use https://github.com/sphinx-extensions2/sphinx-autodoc2 - -from sphinx.ext import apidoc - -output_dir = __location__ / "api" -module_dir = __src__ / "src/MEDS_tabular_automl" -if output_dir.is_dir(): - shutil.rmtree(output_dir) - -try: - cmd_line = f"--implicit-namespaces -e -f -o {output_dir} {module_dir}" - apidoc.main(cmd_line.split(" ")) -except Exception as e: # pylint: disable=broad-except - print(f"Running `sphinx-apidoc {cmd_line}` failed!\n{e}") - - -# -- General configuration - - -# -- Project information -extensions = [ - "sphinx.ext.duration", - "sphinx.ext.doctest", - "sphinx.ext.autodoc", - "sphinx.ext.autosummary", - "sphinx.ext.intersphinx", - "sphinx.ext.todo", - "sphinx.ext.viewcode", - "sphinx.ext.coverage", - "sphinx.ext.ifconfig", - "sphinx.ext.mathjax", - "sphinx.ext.napoleon", - "sphinx.ext.imgconverter", - "sphinxcontrib.collections", - "sphinx_subfigure", - "myst_parser", - "nbsphinx", - # "sphinx_immaterial", -] -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. 
-html_theme = "pydata_sphinx_theme" -# html_sidebars = {"**": []} # ["logo-text.html", "globaltoc.html", "localtoc.html", "searchbox.html"] -html_sidebars = { - "api/*": [ - "sidebar-nav-bs", - ], - "**": [], -} -nbsphinx_allow_errors = True - - -collections_dir = __location__ / "_collections" -if not collections_dir.is_dir(): - os.mkdir(collections_dir) - -python_version = ".".join(map(str, sys.version_info[0:2])) -intersphinx_mapping = { - "sphinx": ("https://www.sphinx-doc.org/en/master", None), - "python": ("https://docs.python.org/" + python_version, None), - "matplotlib": ("https://matplotlib.org", None), - "numpy": ("https://numpy.org/doc/stable", None), - "sklearn": ("https://scikit-learn.org/stable", None), - "pandas": ("https://pandas.pydata.org/docs", None), - "pandera": ("https://pandera.readthedocs.io/en/stable", None), - "scipy": ("https://docs.scipy.org/doc/scipy/reference", None), - "setuptools": ("https://setuptools.pypa.io/en/stable/", None), - "pyscaffold": ("https://pyscaffold.org/en/stable", None), - "hyperimpute": ("https://hyperimpute.readthedocs.io/en/latest/", None), - "xgbse": ("https://loft-br.github.io/xgboost-survival-embeddings/", None), - "lifelines": ("https://lifelines.readthedocs.io/en/stable/", None), - "optuna": ("https://optuna.readthedocs.io/en/stable/", None), -} -intersphinx_disabled_domains = ["std"] - -templates_path = ["_templates"] - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] - -# Control options for included jupyter notebooks. -nb_execution_mode = "off" - - -# -- Options for HTML output - -# Configure MyST-Parser -myst_enable_extensions = [ - "amsmath", - "colon_fence", - "deflist", - "dollarmath", - "html_image", - "linkify", - "replacements", - "smartquotes", - "substitution", - "tasklist", -] - -myst_update_mathjax = True - -# MyST URL schemes. -myst_url_schemes = { - "http": None, - "https": None, - "ftp": None, - "mailto": None, - "repo-code": "https://github.com/mmcdermott/MEDS_Tabular_AutoML/tree/main/{{path}}#{{fragment}}", - # "doi": "https://doi.org/{{path}}", - # "gh-issue": { - # "url": "https://github.com/executablebooks/MyST-Parser/issue/{{path}}#{{fragment}}", - # "title": "Issue #{{path}}", - # "classes": ["github"], - # }, -} - -# The suffix of source filenames. -source_suffix = [".rst", ".md"] - -# The encoding of source files. -# source_encoding = 'utf-8-sig' - -# The master toctree document. -master_doc = "index" - -# The reST default role (used for this markup: `text`) to use for all documents. -default_role = "py:obj" - -# If true, '()' will be appended to :func: etc. cross-reference text. -# add_function_parentheses = True - -# If true, the current module name will be prepended to all description -# unit titles (such as .. function::). -# add_module_names = True - -# If true, sectionauthor and moduleauthor directives will be shown in the -# output. They are ignored by default. -# show_authors = False - -# The name of the Pygments (syntax highlighting) style to use. -# https://pygments.org/styles/ -pygments_style = "tango" - -# A list of ignored prefixes for module index sorting. -# modindex_common_prefix = [] - -# If true, keep warnings as "system message" paragraphs in the built documents. -# keep_warnings = False - -# If this is True, todo emits a warning for each TODO entries. The default is False. 
-todo_emit_warnings = True - - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. - - -html_title = f"MEDS-Tab v{release} Documentation" -html_short_title = "MEDS-Tab" - -# html_logo = "query-512.png" -# html_favicon = "query-16.ico" - -# Material theme options (see theme.conf for more information) -html_theme_options = { - "logo": { - "text": "MEDS-TAB", - "image_light": "../assets/dark_purple_meds_tab.png", - "image_dark": "../assets/light_purple_meds_tab.png", - }, - "icon_links": [ - { - "name": "GitHub", - "url": "https://github.com/mmcdermott/MEDS_Tabular_AutoML", # required - "icon": "fa-brands fa-github", - "type": "fontawesome", - }, - { - "name": "PyPI", - "url": "https://pypi.org/project/meds-tab/", - "icon": "fa-brands fa-python", - }, - ], - "header_links_before_dropdown": 6, - "show_toc_level": 1, - "navbar_align": "left", # [left, content, right] For testing that the navbar items align properly - # "show_nav_level": 2, - "announcement": "This is a community-supported tool. If you'd like to contribute, check out our GitHub repository. Your contributions are welcome!", # noqa E501 - "show_version_warning_banner": True, - "switcher": { - "json_url": json_url, - "version_match": version_match, - }, - "navbar_center": ["version-switcher", "navbar-nav"], - "footer_start": ["copyright"], - "footer_center": ["sphinx-version"], - "use_edit_page_button": True, - # "secondary_sidebar_items": { - # "**/*": ["page-toc", "edit-this-page", "sourcelink"], - # }, - "back_to_top_button": True, -} - -html_context = { - "github_user": "mmcdermott", - "github_repo": "MEDS_Tabular_AutoML", - "github_version": "main", - "doc_path": "docs/source", -} - -# html_theme_options = { -# # Set the name of the project to appear in the navigation. -# "nav_title": "MEDS-TAB", -# "palette": {"primary": "purple", "accent": "purple"}, -# # { -# # "media": "(prefers-color-scheme: light)", -# # "scheme": "default", -# # "toggle": { -# # "icon": "material/toggle-switch-off-outline", -# # "name": "Switch to dark mode", -# # }, -# # }, -# # { -# # "media": "(prefers-color-scheme: dark)", -# # "scheme": "slate", -# # "toggle": { -# # "icon": "material/toggle-switch", -# # "name": "Switch to light mode", -# # }, -# # }, -# # "color_primary": "green", -# # "color_accent": "green", -# # Set the repo location to get a badge with stats -# "repo_url": "https://github.com/mmcdermott/MEDS_Tabular_AutoML", -# "repo_name": "meds-tab", -# # Visible levels of the global TOC; -1 means unlimited -# "globaltoc_depth": 3, -# # If False, expand all TOC entries -# "globaltoc_collapse": True, -# # If True, show hidden TOC entries -# "globaltoc_includehidden": False, -# } - - -html_show_copyright = True -htmlhelp_basename = "meds-tab-doc" - -# -- Options for LaTeX output -# latex_engine = "xelatex" -latex_elements = { # type: ignore - # The paper size ("letterpaper" or "a4paper"). - "papersize": "letterpaper", - # The font size ("10pt", "11pt" or "12pt"). - "pointsize": "10pt", - # Additional stuff for the LaTeX preamble. 
- "preamble": "\n".join( - [ - r"\usepackage{svg}", - r"\DeclareUnicodeCharacter{2501}{-}", - r"\DeclareUnicodeCharacter{2503}{|}", - r"\DeclareUnicodeCharacter{2500}{-}", - r"\DeclareUnicodeCharacter{2550}{-}", - r"\DeclareUnicodeCharacter{2517}{+}", - r"\DeclareUnicodeCharacter{2518}{+}", - r"\DeclareUnicodeCharacter{2534}{+}", - r"\DeclareUnicodeCharacter{250C}{+}", - r"\DeclareUnicodeCharacter{252C}{+}", - r"\DeclareUnicodeCharacter{2510}{+}", - r"\DeclareUnicodeCharacter{2502}{|}", - r"\DeclareUnicodeCharacter{2506}{|}", - r"\DeclareUnicodeCharacter{2561}{|}", - r"\DeclareUnicodeCharacter{256A}{|}", - r"\DeclareUnicodeCharacter{2523}{|}", - r"\DeclareUnicodeCharacter{03BC}{\ensuremath{\mu}}", - r"\DeclareUnicodeCharacter{255E}{|}", - r"\DeclareUnicodeCharacter{255F}{+}", - r"\DeclareUnicodeCharacter{254E}{|}", - r"\DeclareUnicodeCharacter{257C}{-}", - r"\DeclareUnicodeCharacter{257E}{-}", - r"\DeclareUnicodeCharacter{2559}{+}", - ] - ), -} - - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, author, documentclass [howto/manual]). -latex_documents = [ - ( - "index", - "meds_tab_documentation.tex", - "MEDS-TAB Documentation", - r"Matthew McDermott \& Nassim Oufattole \& Teya Bergamaschi", - "manual", - ) -] - -# The name of an image file (relative to this directory) to place at the top of -# the title page. -# latex_logo = "" - -# For "manual" documents, if this is true, then toplevel headings are parts, -# not chapters. -# latex_use_parts = False - -# -- Options for EPUB output -epub_show_urls = "footnote" - -print(f"loading configurations for {project} {release} ...", file=sys.stderr) - - -def setup(app): - app.connect("builder-inited", ensure_pandoc_installed) diff --git a/docs/source/index.md b/docs/source/index.md deleted file mode 100644 index 8f75fb2..0000000 --- a/docs/source/index.md +++ /dev/null @@ -1,39 +0,0 @@ -# Welcome! - -MEDS-Tab is a library designed for automated tabularization, data preparation with aggregation, and time windowing. Check out below for an overview of MEDS-Tab and how it could be useful in your workflows! - -```{toctree} ---- -glob: -maxdepth: 2 ---- -Overview -Pipeline -Memory/CPU Usage -Performance -API -``` - -______________________________________________________________________ - -## Why MEDS-Tab? - -MEDS-Tab is a comprehensive framework designed to streamline the handling, modeling, and analysis of complex medical time-series data. By leveraging automated processes, MEDS-Tab significantly reduces the computation required to generate high-quality baseline models for diverse supervised learning tasks. - -- Cost Efficiency: MEDS-Tab is dramatically more cost-effective compared to existing solutions -- Strong Performance: MEDS-Tab provides robustness and high performance across various datasets compared with other frameworks. - -### I. Transform to MEDS - -MEDS-Tab leverages the recently developed, minimal, easy-to-use Medical Event Data Standard (MEDS) schema to standardize structured EHR data to a consistent schema from which baselines can be reliably produced across arbitrary tasks and settings. In order to use MEDS-Tab, you will first need to transform your raw EHR data to a MEDS format, which can be done using the following libraries: - -- [MEDS Polars](https://github.com/mmcdermott/MEDS_polars_functions) for a set of functions and scripts for extraction to and transformation/pre-processing of MEDS-formatted data. 
-- [MEDS ETL](https://github.com/Medical-Event-Data-Standard/meds_etl) for a collection of ETLs from common data formats to MEDS. The package library currently supports MIMIC-IV, OMOP v5, and MEDS FLAT (a flat version of MEDS). - -### II. Run MEDS-Tab - -- Run the MEDS-Tab Command-Line Interface tool (`MEDS-Tab-cli`) to extract cohorts based on your task - check out the [Usage Guide](https://meds-tab--36.org.readthedocs.build/en/36/overview.html#core-cli-scripts-overview)! - -- Painless Reproducibility: Use [MEDS-Tab](https://github.com/mmcdermott/MEDS_TAB_MIMIC_IV/tree/main/tasks) to obtain comparable, reproducible, and well-tuned XGBoost results tailored to your dataset-specific feature space! - -By following these steps, you can seamlessly transform your dataset, define necessary criteria, and leverage powerful machine learning tools within the MEDS-Tab ecosystem. This approach not only simplifies the process but also ensures high-quality, reproducible results for your machine learning tasks for health projects. It can reliably take no more than a week of full-time human effort to perform Steps I-V on new datasets in reasonable raw formulations! diff --git a/docs/source/overview.md b/docs/source/overview.md deleted file mode 100644 index 1d453f0..0000000 --- a/docs/source/overview.md +++ /dev/null @@ -1,161 +0,0 @@ -# Usage - -This repository consists of two key pieces: - -1. Construction and efficient loading of tabular (flat, non-longitudinal) summary features describing patient records in MEDS over arbitrary time windows (e.g. 1 year, 6 months, etc.), which go backward in time from a given index date. -2. Running a basic XGBoost AutoML pipeline over these tabular features to predict arbitrary binary classification or regression downstream tasks defined over these datasets. The "AutoML" part of this is not particularly advanced -- what is more advanced is the efficient construction, storage, and loading of tabular features for the candidate AutoML models, enabling a far more extensive search over a much larger total number of features than prior systems. - -## Quick Start - -To use MEDS-Tab, install the dependencies following commands below: - -**Pip Install** - -```console -pip install meds-tab -``` - -**Local Install** - -```console -# clone the git repo -pip install . -``` - -## Scripts and Examples - -For an end-to-end example over MIMIC-IV, see the [MIMIC-IV companion repository](https://github.com/mmcdermott/MEDS_TAB_MIMIC_IV). -For an end-to-end example over Philips eICU, see the [eICU companion repository](https://github.com/mmcdermott/MEDS_TAB_EICU). - -See [`/tests/test_integration.py`](https://github.com/mmcdermott/MEDS_Tabular_AutoML/blob/main/tests/test_integration.py) for a local example of the end-to-end pipeline being run on synthetic data. This script is a functional test that is also run with `pytest` to verify the correctness of the algorithm. - -## Core CLI Scripts Overview - -1. **`meds-tab-describe`**: This command processes MEDS data shards to compute the frequencies of different code types. It differentiates codes into the following categories: - - - time-series codes (codes with timestamps) - - time-series numerical values (codes with timestamps and numerical values) - - static codes (codes without timestamps) - - static numerical codes (codes without timestamps but with numerical values). 
- - This script further caches feature names and frequencies in a dataset stored in a `code_metadata.parquet` file within the `input_dir` argument specified as a hydra-style command line argument. - -2. **`meds-tab-tabularize-static`**: Filters and processes the dataset based on the frequency of codes, generating a tabular vector for each patient at each timestamp in the shards. Each row corresponds to a unique `subject_id` and `timestamp` combination, thus rows are duplicated across multiple timestamps for the same patient. - - **Example: Tabularizing static data** with the minimum code frequency of 10, window sizes of `[1d, 30d, 365d, full]`, and value aggregation methods of `[static/present, static/first, code/count, value/count, value/sum, value/sum_sqd, value/min, value/max]` - - ```console - meds-tab-tabularize-static input_dir="path_to_data" \ - tabularization.min_code_inclusion_frequency=10 \ - tabularization.window_sizes=[1d,30d,365d,full] \ - do_overwrite=False \ - tabularization.aggs=[static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]" - ``` - - - For the exhaustive examples of value aggregations, see [`/src/MEDS_tabular_automl/utils.py`](https://github.com/mmcdermott/MEDS_Tabular_AutoML/blob/main/src/MEDS_tabular_automl/utils.py#L24) - -3. **`meds-tab-tabularize-time-series`**: Iterates through combinations of a shard, `window_size`, and `aggregation` to generate feature vectors that aggregate patient data for each unique `subject_id` x `timestamp`. This stage (and the previous stage) uses sparse matrix formats to efficiently handle the computational and storage demands of rolling window calculations on large datasets. We support parallelization through Hydra's [`--multirun`](https://hydra.cc/docs/intro/#multirun) flag and the [`joblib` launcher](https://hydra.cc/docs/plugins/joblib_launcher/#internaldocs-banner). - - **Example: Aggregate time-series data** on features across different `window_sizes` - - ```console - meds-tab-tabularize-time-series --multirun \ - worker="range(0,$N_PARALLEL_WORKERS)" \ - hydra/launcher=joblib \ - input_dir="path_to_data" \ - tabularization.min_code_inclusion_frequency=10 \ - do_overwrite=False \ - tabularization.window_sizes=[1d,30d,365d,full] \ - tabularization.aggs=[static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max] - ``` - -4. **`meds-tab-cache-task`**: Aligns task-specific labels with the nearest prior event in the tabularized data. It requires a labeled dataset directory with three columns (`subject_id`, `timestamp`, `label`) structured similarly to the `input_dir`. - - **Example: Align tabularized data** for a specific task `$TASK` and labels that has pulled from [ACES](https://github.com/justin13601/ACES) - - ```console - meds-tab-cache-task input_dir="path_to_data" \ - task_name=$TASK \ - tabularization.min_code_inclusion_frequency=10 \ - do_overwrite=False \ - tabularization.window_sizes=[1d,30d,365d,full] \ - tabularization.aggs=[static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max] - ``` - -5. **`meds-tab-xgboost`**: Trains an XGBoost model using user-specified parameters. Permutations of `window_sizes` and `aggs` can be generated using `generate-subsets` command (See the section below for descriptions). 
- - ```console - meds-tab-xgboost --multirun \ - input_dir="path_to_data" \ - task_name=$TASK \ - output_dir="output_directory" \ - tabularization.min_code_inclusion_frequency=10 \ - tabularization.window_sizes=$(generate-subsets [1d,30d,365d,full]) \ - do_overwrite=False \ - tabularization.aggs=$(generate-subsets [static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]) - ``` - -## Additional CLI Scripts - -1. **`generate-subsets`**: Generates and prints a sorted list of all non-empty subsets from a comma-separated input. This is provided for the convenience of sweeping over all possible combinations of window sizes and aggregations. - - For example, you can directly call **`generate-subsets`** in the command line: - - ```console - generate-subsets [2,3,4] \ - [2], [2, 3], [2, 3, 4], [2, 4], [3], [3, 4], [4] - ``` - - This could be used in the command line in concert with other calls. For example, the following call: - - ```console - meds-tab-xgboost --multirun tabularization.window_sizes=$(generate-subsets [1d,2d,7d,full]) - ``` - - would resolve to: - - ```console - meds-tab-xgboost --multirun tabularization.window_sizes=[1d],[1d,2d],[1d,2d,7d],[1d,2d,7d,full],[1d,2d,full],[1d,7d],[1d,7d,full],[1d,full],[2d],[2d,7d],[2d,7d,full],[2d,full],[7d],[7d,full],[full] - ``` - - which can then be correctly interpreted by Hydra's multirun logic. - -## Roadmap - -MEDS-Tab has several key limitations which we plan to address in future changes. These include, and are tracked by, the following GitHub issues. - -### Improvements to the core tabularization - -1. Further memory and runtime improvements are possible: [#16](https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues/16) -2. We should support additional window sizes and types: [#31](https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues/31) -3. We should support additional aggregation functions: [#32](https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues/32) - -### Improvements to the modeling pipeline - -1. We should likely decorrelate the default aggregations and/or window sizes we use prior to passing them into the models as features: [#27](https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues/27) -2. We need to do a detailed parameter study over the hyperparameter sweep options to find good defaults for these kinds of problems and models: [#33](https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues/33) -3. We should support a more extensive set of pipeline operations and model architectures: [#37](https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues/37) - -### Technical debt / code improvements - -1. The computation and use of the code metadata dataframe, containing frequencies of codes, should be offloaded to core MEDS functionality, with the remaining code in this repository cleaned up. - - [#28](https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues/28) - - [#14](https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues/14) -2. We should add more doctests and push test coverage up to 100% - - [#29](https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues/29) - - [#30](https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues/30) -3. We need to ensure full and seamless compatibility with the ACES CLI tool, rather than relying on the python API and manual adjustments: - [#34](https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues/34) - -## What do you mean "tabular pipelines"? Isn't _all_ structured EHR data already tabular? - -This is a common misconception. 
_Tabular_ data refers to data that can be organized in a consistent, logical -set of rows/columns such that the entirety of a "sample" or "instance" for modeling or analysis is contained -in a single row, and the set of columns possibly observed (there can be missingness) is consistent across all -rows. Structured EHR data does not satisfy this definition, as we will have different numbers of observations -of medical codes and values at different timestamps for different patients, so it cannot simultanesouly -satisfy the (1) "single row single instance", (2) "consistent set of columns", and (3) "logical" requirements. -Thus, in this pipeline, when we say we will produce a "tabular" view of MEDS data, we mean a dataset that can -realize these constraints, which will explicitly involve summarizing the patient data over various historical -or future windows in time to produce a single row per patient with a consistent, logical set of columns -(though there may still be missingness). diff --git a/docs/terminology.md b/docs/terminology.md new file mode 100644 index 0000000..49560cc --- /dev/null +++ b/docs/terminology.md @@ -0,0 +1,3 @@ +# Definitions for meds-tab terms + +Refer to the terms defined in the [official MEDS Schema](https://github.com/Medical-Event-Data-Standard/meds) and [MEDS_transforms](https://meds-transforms.readthedocs.io/en/latest/terminology/). diff --git a/docs/tutorial.md b/docs/tutorial.md new file mode 100644 index 0000000..4feae65 --- /dev/null +++ b/docs/tutorial.md @@ -0,0 +1 @@ +--8<-- "MIMICIV_TUTORIAL/README.MD" diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000..62b33de --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,81 @@ +site_name: meds-tab +repo_url: https://github.com/mmcdermott/MEDS_Tabular_AutoML +site_description: Documentation for the meds-tab package +site_author: Nassim Oufattole + +nav: + - "Home": index.md + - "Overview": overview.md + - "MIMICIV Tutorial": tutorial.md + - "Terminology": terminology.md + - "Prediction": prediction.md + - "Profiling": profiling.md + - "API Reference": reference/api/ + - "Config Reference": reference/config/ + - "Issues": https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues + +theme: + features: + - announce.dismiss + - content.action.edit + - content.action.view + - content.code.annotate + - content.code.copy + custom_dir: docs/overrides + logo: assets/light_logo.png + name: material + locale: en + palette: + - media: "(prefers-color-scheme)" + toggle: + icon: material/link + name: Switch to light mode + - media: "(prefers-color-scheme: light)" + scheme: default + primary: indigo + accent: indigo + toggle: + icon: material/toggle-switch + name: Switch to dark mode + - media: "(prefers-color-scheme: dark)" + scheme: slate + primary: black + accent: indigo + toggle: + icon: material/toggle-switch-off + name: Switch to system preference + font: + text: Roboto + code: Roboto Mono + favicon: assets/light_logo.png + +markdown_extensions: + - smarty + - pymdownx.arithmatex: + generic: true + - pymdownx.highlight: + anchor_linenums: true + - pymdownx.inlinehilite + - pymdownx.smartsymbols + - pymdownx.snippets + - pymdownx.tabbed: + alternate_style: true + - pymdownx.superfences + - admonition + - pymdownx.details + +extra_javascript: + - javascripts/mathjax.js + - https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js + +plugins: + - search + - gen-files: + scripts: + - docs/gen_ref_pages.py + - literate-nav: + nav_file: SUMMARY.md + - section-index + - mkdocstrings + - git-authors + - 
git-revision-date-localized diff --git a/pyproject.toml b/pyproject.toml index a30e14b..670abee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,10 +33,24 @@ generate-subsets = "MEDS_tabular_automl.scripts.generate_subsets:main" [project.optional-dependencies] -dev = ["pre-commit"] +dev = ["pre-commit<4"] tests = ["pytest", "pytest-cov", "rootutils"] profiling = ["mprofile", "matplotlib"] autogluon = ["autogluon; python_version=='3.11.*'"] # Environment marker to restrict AutoGluon to Python 3.11 +docs = [ + "mkdocs==1.6.0", + "mkdocs-gen-files==0.5.0", + "mkdocs-get-deps==0.2.0", + "mkdocs-git-authors-plugin==0.9.0", + "mkdocs-git-revision-date-localized-plugin==1.2.7", + "mkdocs-literate-nav==0.6.1", + "mkdocs-material==9.5.33", + "mkdocs-material-extensions==1.3.1", + "mkdocs-section-index==0.3.9", + "mkdocs-snippets==1.3.0", + "mkdocstrings==0.25.2", + "mkdocstrings-python==1.10.8" +] [build-system] requires = ["setuptools>=61.0", "setuptools-scm>=8.0", "wheel"]