diff --git a/.gitignore b/.gitignore index 68bc17f..57eed8c 100644 --- a/.gitignore +++ b/.gitignore @@ -158,3 +158,5 @@ cython_debug/ # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ + +docs/source/generated diff --git a/README.md b/README.md index 9575b17..1ce978f 100644 --- a/README.md +++ b/README.md @@ -46,7 +46,7 @@ This repository consists of two key pieces: what is more advanced is the efficient construction, storage, and loading of tabular features for the candidate AutoML models, enabling a far more extensive search over different featurization strategies. -### Scripts and Examples +## Scripts and Examples See `tests/test_integration.py` for an example of the end-to-end pipeline being run on synthetic data. This script is a functional test that is also run with `pytest` to verify the correctness of the algorithm. @@ -54,7 +54,7 @@ script is a functional test that is also run with `pytest` to verify the correct For an end to end example over MIMIC-IV, see the [companion repository](https://github.com/mmcdermott/MEDS_TAB_MIMIC_IV) For an end to end example over Philips eICU, see the [eICU companion repository](https://github.com/mmcdermott/MEDS_TAB_EICU). -### Core CLI Scripts Overview +## Core CLI Scripts Overview 1. **`meds-tab-describe`**: This command processes MEDS data shards to compute the frequencies of different code-types @@ -124,7 +124,7 @@ For an end to end example over Philips eICU, see the [eICU companion repository] 6. **`meds-tab-xgboost-sweep`**: Conducts an Optuna hyperparameter sweep to optimize over `window_sizes`, `aggregations`, and `min_code_inclusion_frequency`, aiming to enhance model performance and adaptability. -### Additional CLI Scripts +## Additional CLI Scripts 1. **`generate-permutations`**: Generates and prints a sorted list of all permutations from a comma separated input. This is provided for the convenience of sweeping over all possible combinations of window sizes and aggregations. @@ -149,7 +149,7 @@ For an end to end example over Philips eICU, see the [eICU companion repository] # How does MEDS-Tab Work? -#### What do you mean "tabular pipelines"? Isn't _all_ structured EHR data already tabular? +## What do you mean "tabular pipelines"? Isn't _all_ structured EHR data already tabular? This is a common misconception. _Tabular_ data refers to data that can be organized in a consistent, logical set of rows/columns such that the entirety of a "sample" or "instance" for modeling or analysis is contained diff --git a/docs/source/conf.py b/docs/source/conf.py index 34fb932..6cf386b 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -1,4 +1,5 @@ import os +import shutil import sys # Configuration file for the Sphinx documentation builder. @@ -10,15 +11,24 @@ # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information project = "MEDS-Tab" -copyright = "2024, Matthew McDermott, Nassim Oufattole, Teya Bergamaschi" -author = "Matthew McDermott, Nassim Oufattole, Teya Bergamaschi" -release = "0.0.1" -version = "0.0.1" +copyright = "2024, Nassim Oufattole, Matthew McDermott, Teya Bergamaschi, Aleksia Kolo, Hyewon Jeong" +author = "Nassim Oufattole, Matthew McDermott, Teya Bergamaschi, Aleksia Kolo, Hyewon Jeong" +release = "0.0.2" +version = "0.0.2" # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration -sys.path.insert(0, os.path.abspath("../..")) +# -- Path setup +from pathlib import Path + +__location__ = Path(os.path.dirname(__file__)) +__src__ = __location__ / "../.." + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +sys.path.insert(0, str(__src__)) extensions = [ "sphinx.ext.duration", @@ -38,6 +48,24 @@ ".md": "markdown", } +# -- Run sphinx-apidoc +# This ensures we don't need to run apidoc manually. + +# TODO: use https://github.com/sphinx-extensions2/sphinx-autodoc2 + +from sphinx.ext import apidoc + +output_dir = __location__ / "generated" +module_dir = __src__ / "src/MEDS_tabular_automl" +if output_dir.is_dir(): + shutil.rmtree(output_dir) + +try: + cmd_line = f"--implicit-namespaces -e -f -o {output_dir} {module_dir}" + apidoc.main(cmd_line.split(" ")) +except Exception as e: # pylint: disable=broad-except + print(f"Running `sphinx-apidoc {cmd_line}` failed!\n{e}") + intersphinx_mapping = { "python": ("https://docs.python.org/3/", None), "sphinx": ("https://www.sphinx-doc.org/en/master/", None), diff --git a/docs/source/generated/src.MEDS_tabular_automl.configs.rst b/docs/source/generated/src.MEDS_tabular_automl.configs.rst deleted file mode 100644 index ea0b463..0000000 --- a/docs/source/generated/src.MEDS_tabular_automl.configs.rst +++ /dev/null @@ -1,30 +0,0 @@ -src.MEDS\_tabular\_automl.configs -================================= - -.. automodule:: src.MEDS_tabular_automl.configs - - - - - - - - - - - - - - - - - - - -.. rubric:: Modules - -.. autosummary:: - :toctree: - :recursive: - - src.MEDS_tabular_automl.configs.tabularization diff --git a/docs/source/generated/src.MEDS_tabular_automl.configs.tabularization.rst b/docs/source/generated/src.MEDS_tabular_automl.configs.tabularization.rst deleted file mode 100644 index 060282a..0000000 --- a/docs/source/generated/src.MEDS_tabular_automl.configs.tabularization.rst +++ /dev/null @@ -1,4 +0,0 @@ -src.MEDS\_tabular\_automl.configs.tabularization -================================================ - -.. automodule:: src.MEDS_tabular_automl.configs.tabularization diff --git a/docs/source/generated/src.MEDS_tabular_automl.describe_codes.rst b/docs/source/generated/src.MEDS_tabular_automl.describe_codes.rst deleted file mode 100644 index cbf802b..0000000 --- a/docs/source/generated/src.MEDS_tabular_automl.describe_codes.rst +++ /dev/null @@ -1,23 +0,0 @@ -src.MEDS\_tabular\_automl.describe\_codes -========================================= - -.. automodule:: src.MEDS_tabular_automl.describe_codes - - - - - - - - .. rubric:: Functions - - .. autosummary:: - - clear_code_aggregation_suffix - compute_feature_frequencies - convert_to_df - convert_to_freq_dict - filter_parquet - filter_to_codes - get_feature_columns - get_feature_freqs diff --git a/docs/source/generated/src.MEDS_tabular_automl.file_name.rst b/docs/source/generated/src.MEDS_tabular_automl.file_name.rst deleted file mode 100644 index 3b75288..0000000 --- a/docs/source/generated/src.MEDS_tabular_automl.file_name.rst +++ /dev/null @@ -1,18 +0,0 @@ -src.MEDS\_tabular\_automl.file\_name -==================================== - -.. automodule:: src.MEDS_tabular_automl.file_name - - - - - - - - .. rubric:: Functions - - .. autosummary:: - - get_model_files - get_task_specific_path - list_subdir_files diff --git a/docs/source/generated/src.MEDS_tabular_automl.generate_static_features.rst b/docs/source/generated/src.MEDS_tabular_automl.generate_static_features.rst deleted file mode 100644 index 7656f7f..0000000 --- a/docs/source/generated/src.MEDS_tabular_automl.generate_static_features.rst +++ /dev/null @@ -1,19 +0,0 @@ -src.MEDS\_tabular\_automl.generate\_static\_features -==================================================== - -.. automodule:: src.MEDS_tabular_automl.generate_static_features - - - - - - - - .. rubric:: Functions - - .. autosummary:: - - convert_to_matrix - get_flat_static_rep - get_sparse_static_rep - summarize_static_measurements diff --git a/docs/source/generated/src.MEDS_tabular_automl.generate_summarized_reps.rst b/docs/source/generated/src.MEDS_tabular_automl.generate_summarized_reps.rst deleted file mode 100644 index b9f44d7..0000000 --- a/docs/source/generated/src.MEDS_tabular_automl.generate_summarized_reps.rst +++ /dev/null @@ -1,20 +0,0 @@ -src.MEDS\_tabular\_automl.generate\_summarized\_reps -==================================================== - -.. automodule:: src.MEDS_tabular_automl.generate_summarized_reps - - - - - - - - .. rubric:: Functions - - .. autosummary:: - - aggregate_matrix - compute_agg - generate_summary - get_rolling_window_indicies - sparse_aggregate diff --git a/docs/source/generated/src.MEDS_tabular_automl.generate_ts_features.rst b/docs/source/generated/src.MEDS_tabular_automl.generate_ts_features.rst deleted file mode 100644 index e4fdd21..0000000 --- a/docs/source/generated/src.MEDS_tabular_automl.generate_ts_features.rst +++ /dev/null @@ -1,20 +0,0 @@ -src.MEDS\_tabular\_automl.generate\_ts\_features -================================================ - -.. automodule:: src.MEDS_tabular_automl.generate_ts_features - - - - - - - - .. rubric:: Functions - - .. autosummary:: - - feature_name_to_code - get_flat_ts_rep - get_long_code_df - get_long_value_df - summarize_dynamic_measurements diff --git a/docs/source/generated/src.MEDS_tabular_automl.mapper.rst b/docs/source/generated/src.MEDS_tabular_automl.mapper.rst deleted file mode 100644 index a0a05ea..0000000 --- a/docs/source/generated/src.MEDS_tabular_automl.mapper.rst +++ /dev/null @@ -1,18 +0,0 @@ -src.MEDS\_tabular\_automl.mapper -================================ - -.. automodule:: src.MEDS_tabular_automl.mapper - - - - - - - - .. rubric:: Functions - - .. autosummary:: - - get_earliest_lock - register_lock - wrap diff --git a/docs/source/generated/src.MEDS_tabular_automl.rst b/docs/source/generated/src.MEDS_tabular_automl.rst deleted file mode 100644 index 2fd4a60..0000000 --- a/docs/source/generated/src.MEDS_tabular_automl.rst +++ /dev/null @@ -1,38 +0,0 @@ -src.MEDS\_tabular\_automl -========================= - -.. automodule:: src.MEDS_tabular_automl - - - - - - - - - - - - - - - - - - - -.. rubric:: Modules - -.. autosummary:: - :toctree: - :recursive: - - src.MEDS_tabular_automl.configs - src.MEDS_tabular_automl.describe_codes - src.MEDS_tabular_automl.file_name - src.MEDS_tabular_automl.generate_static_features - src.MEDS_tabular_automl.generate_summarized_reps - src.MEDS_tabular_automl.generate_ts_features - src.MEDS_tabular_automl.mapper - src.MEDS_tabular_automl.scripts - src.MEDS_tabular_automl.utils diff --git a/docs/source/generated/src.MEDS_tabular_automl.scripts.cache_task.rst b/docs/source/generated/src.MEDS_tabular_automl.scripts.cache_task.rst deleted file mode 100644 index 76d5c8c..0000000 --- a/docs/source/generated/src.MEDS_tabular_automl.scripts.cache_task.rst +++ /dev/null @@ -1,17 +0,0 @@ -src.MEDS\_tabular\_automl.scripts.cache\_task -============================================= - -.. automodule:: src.MEDS_tabular_automl.scripts.cache_task - - - - - - - - .. rubric:: Functions - - .. autosummary:: - - generate_row_cached_matrix - main diff --git a/docs/source/generated/src.MEDS_tabular_automl.scripts.describe_codes.rst b/docs/source/generated/src.MEDS_tabular_automl.scripts.describe_codes.rst deleted file mode 100644 index 7b604e6..0000000 --- a/docs/source/generated/src.MEDS_tabular_automl.scripts.describe_codes.rst +++ /dev/null @@ -1,16 +0,0 @@ -src.MEDS\_tabular\_automl.scripts.describe\_codes -================================================= - -.. automodule:: src.MEDS_tabular_automl.scripts.describe_codes - - - - - - - - .. rubric:: Functions - - .. autosummary:: - - main diff --git a/docs/source/generated/src.MEDS_tabular_automl.scripts.launch_xgboost.rst b/docs/source/generated/src.MEDS_tabular_automl.scripts.launch_xgboost.rst deleted file mode 100644 index 61afb61..0000000 --- a/docs/source/generated/src.MEDS_tabular_automl.scripts.launch_xgboost.rst +++ /dev/null @@ -1,27 +0,0 @@ -src.MEDS\_tabular\_automl.scripts.launch\_xgboost -================================================= - -.. automodule:: src.MEDS_tabular_automl.scripts.launch_xgboost - - - - - - - - .. rubric:: Functions - - .. autosummary:: - - main - - - - - - .. rubric:: Classes - - .. autosummary:: - - Iterator - XGBoostModel diff --git a/docs/source/generated/src.MEDS_tabular_automl.scripts.rst b/docs/source/generated/src.MEDS_tabular_automl.scripts.rst deleted file mode 100644 index 15ca299..0000000 --- a/docs/source/generated/src.MEDS_tabular_automl.scripts.rst +++ /dev/null @@ -1,34 +0,0 @@ -src.MEDS\_tabular\_automl.scripts -================================= - -.. automodule:: src.MEDS_tabular_automl.scripts - - - - - - - - - - - - - - - - - - - -.. rubric:: Modules - -.. autosummary:: - :toctree: - :recursive: - - src.MEDS_tabular_automl.scripts.cache_task - src.MEDS_tabular_automl.scripts.describe_codes - src.MEDS_tabular_automl.scripts.launch_xgboost - src.MEDS_tabular_automl.scripts.tabularize_static - src.MEDS_tabular_automl.scripts.tabularize_time_series diff --git a/docs/source/generated/src.MEDS_tabular_automl.scripts.tabularize_static.rst b/docs/source/generated/src.MEDS_tabular_automl.scripts.tabularize_static.rst deleted file mode 100644 index 61852e0..0000000 --- a/docs/source/generated/src.MEDS_tabular_automl.scripts.tabularize_static.rst +++ /dev/null @@ -1,16 +0,0 @@ -src.MEDS\_tabular\_automl.scripts.tabularize\_static -==================================================== - -.. automodule:: src.MEDS_tabular_automl.scripts.tabularize_static - - - - - - - - .. rubric:: Functions - - .. autosummary:: - - main diff --git a/docs/source/generated/src.MEDS_tabular_automl.scripts.tabularize_time_series.rst b/docs/source/generated/src.MEDS_tabular_automl.scripts.tabularize_time_series.rst deleted file mode 100644 index 066a968..0000000 --- a/docs/source/generated/src.MEDS_tabular_automl.scripts.tabularize_time_series.rst +++ /dev/null @@ -1,16 +0,0 @@ -src.MEDS\_tabular\_automl.scripts.tabularize\_time\_series -========================================================== - -.. automodule:: src.MEDS_tabular_automl.scripts.tabularize_time_series - - - - - - - - .. rubric:: Functions - - .. autosummary:: - - main diff --git a/docs/source/generated/src.MEDS_tabular_automl.utils.rst b/docs/source/generated/src.MEDS_tabular_automl.utils.rst deleted file mode 100644 index ba45d9c..0000000 --- a/docs/source/generated/src.MEDS_tabular_automl.utils.rst +++ /dev/null @@ -1,37 +0,0 @@ -src.MEDS\_tabular\_automl.utils -=============================== - -.. automodule:: src.MEDS_tabular_automl.utils - - - - - - - - .. rubric:: Functions - - .. autosummary:: - - add_static_missing_cols - array_to_sparse_matrix - get_events_df - get_feature_indices - get_feature_names - get_flat_rep_feature_cols - get_min_dtype - get_prediction_ts_cols - get_shard_prefix - get_static_col_dtype - get_static_feature_cols - get_ts_feature_cols - get_unique_time_events_df - hydra_loguru_init - load_matrix - load_meds_data - load_tqdm - parse_static_feature_column - sparse_matrix_to_array - store_config_yaml - store_matrix - write_df diff --git a/docs/source/generated/src.rst b/docs/source/generated/src.rst deleted file mode 100644 index b9f0096..0000000 --- a/docs/source/generated/src.rst +++ /dev/null @@ -1,30 +0,0 @@ -src -=== - -.. automodule:: src - - - - - - - - - - - - - - - - - - - -.. rubric:: Modules - -.. autosummary:: - :toctree: - :recursive: - - src.MEDS_tabular_automl diff --git a/src/MEDS_tabular_automl/scripts/cache_task.py b/src/MEDS_tabular_automl/scripts/cache_task.py index b42e765..62df9b8 100644 --- a/src/MEDS_tabular_automl/scripts/cache_task.py +++ b/src/MEDS_tabular_automl/scripts/cache_task.py @@ -10,9 +10,9 @@ import scipy.sparse as sp from omegaconf import DictConfig -from MEDS_tabular_automl.file_name import list_subdir_files -from MEDS_tabular_automl.mapper import wrap as rwlock_wrap -from MEDS_tabular_automl.utils import ( +from ..file_name import list_subdir_files +from ..mapper import wrap as rwlock_wrap +from ..utils import ( CODE_AGGREGATIONS, STATIC_CODE_AGGREGATION, STATIC_VALUE_AGGREGATION, diff --git a/src/MEDS_tabular_automl/scripts/describe_codes.py b/src/MEDS_tabular_automl/scripts/describe_codes.py index 408980f..fdee111 100644 --- a/src/MEDS_tabular_automl/scripts/describe_codes.py +++ b/src/MEDS_tabular_automl/scripts/describe_codes.py @@ -10,19 +10,14 @@ from loguru import logger from omegaconf import DictConfig, OmegaConf -from MEDS_tabular_automl.describe_codes import ( +from ..describe_codes import ( compute_feature_frequencies, convert_to_df, convert_to_freq_dict, ) -from MEDS_tabular_automl.file_name import list_subdir_files -from MEDS_tabular_automl.mapper import wrap as rwlock_wrap -from MEDS_tabular_automl.utils import ( - get_shard_prefix, - hydra_loguru_init, - load_tqdm, - write_df, -) +from ..file_name import list_subdir_files +from ..mapper import wrap as rwlock_wrap +from ..utils import get_shard_prefix, hydra_loguru_init, load_tqdm, write_df config_yaml = files("MEDS_tabular_automl").joinpath("configs/describe_codes.yaml") if not config_yaml.is_file(): diff --git a/src/MEDS_tabular_automl/scripts/launch_xgboost.py b/src/MEDS_tabular_automl/scripts/launch_xgboost.py index 9089bd5..46233c2 100644 --- a/src/MEDS_tabular_automl/scripts/launch_xgboost.py +++ b/src/MEDS_tabular_automl/scripts/launch_xgboost.py @@ -13,9 +13,9 @@ from omegaconf import DictConfig, OmegaConf from sklearn.metrics import roc_auc_score -from MEDS_tabular_automl.describe_codes import get_feature_columns -from MEDS_tabular_automl.file_name import get_model_files, list_subdir_files -from MEDS_tabular_automl.utils import get_feature_indices, hydra_loguru_init +from ..describe_codes import get_feature_columns +from ..file_name import get_model_files, list_subdir_files +from ..utils import get_feature_indices, hydra_loguru_init config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_xgboost.yaml") if not config_yaml.is_file(): diff --git a/src/MEDS_tabular_automl/scripts/tabularize_static.py b/src/MEDS_tabular_automl/scripts/tabularize_static.py index d7434f0..2474442 100644 --- a/src/MEDS_tabular_automl/scripts/tabularize_static.py +++ b/src/MEDS_tabular_automl/scripts/tabularize_static.py @@ -14,16 +14,16 @@ from omegaconf import DictConfig -from MEDS_tabular_automl.describe_codes import ( +from ..describe_codes import ( convert_to_df, filter_parquet, get_feature_columns, get_feature_freqs, ) -from MEDS_tabular_automl.file_name import list_subdir_files -from MEDS_tabular_automl.generate_static_features import get_flat_static_rep -from MEDS_tabular_automl.mapper import wrap as rwlock_wrap -from MEDS_tabular_automl.utils import ( +from ..file_name import list_subdir_files +from ..generate_static_features import get_flat_static_rep +from ..mapper import wrap as rwlock_wrap +from ..utils import ( STATIC_CODE_AGGREGATION, STATIC_VALUE_AGGREGATION, filter_to_codes, diff --git a/src/MEDS_tabular_automl/scripts/tabularize_time_series.py b/src/MEDS_tabular_automl/scripts/tabularize_time_series.py index c48e59a..c6ecc98 100644 --- a/src/MEDS_tabular_automl/scripts/tabularize_time_series.py +++ b/src/MEDS_tabular_automl/scripts/tabularize_time_series.py @@ -15,12 +15,12 @@ from loguru import logger from omegaconf import DictConfig -from MEDS_tabular_automl.describe_codes import filter_parquet, get_feature_columns -from MEDS_tabular_automl.file_name import list_subdir_files -from MEDS_tabular_automl.generate_summarized_reps import generate_summary -from MEDS_tabular_automl.generate_ts_features import get_flat_ts_rep -from MEDS_tabular_automl.mapper import wrap as rwlock_wrap -from MEDS_tabular_automl.utils import ( +from ..describe_codes import filter_parquet, get_feature_columns +from ..file_name import list_subdir_files +from ..generate_summarized_reps import generate_summary +from ..generate_ts_features import get_flat_ts_rep +from ..mapper import wrap as rwlock_wrap +from ..utils import ( STATIC_CODE_AGGREGATION, STATIC_VALUE_AGGREGATION, get_shard_prefix, diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py index 9527aea..b4e9e81 100644 --- a/src/MEDS_tabular_automl/utils.py +++ b/src/MEDS_tabular_automl/utils.py @@ -78,7 +78,7 @@ def filter_to_codes( return sorted(feature_freqs["code"].to_list()) -OmegaConf.register_new_resolver("filter_to_codes", filter_to_codes) +OmegaConf.register_new_resolver("filter_to_codes", filter_to_codes, replace=True) def load_tqdm(use_tqdm: bool):