diff --git a/.github/workflows/publish-to-pypi.yml b/.github/workflows/publish-to-pypi.yml index d86806f..7a68958 100644 --- a/.github/workflows/publish-to-pypi.yml +++ b/.github/workflows/publish-to-pypi.yml @@ -36,7 +36,7 @@ jobs: runs-on: ubuntu-latest environment: name: pypi - url: https://pypi.org/p/ # Replace with your PyPI project name + url: https://pypi.org/p/meds-tab # Replace with your PyPI project name permissions: id-token: write # IMPORTANT: mandatory for trusted publishing @@ -91,27 +91,3 @@ jobs: gh release upload '${{ github.ref_name }}' dist/** --repo '${{ github.repository }}' - - publish-to-testpypi: - name: Publish Python 🐍 distribution 📦 to TestPyPI - needs: - - build - runs-on: ubuntu-latest - - environment: - name: testpypi - url: https://test.pypi.org/p/ - - permissions: - id-token: write # IMPORTANT: mandatory for trusted publishing - - steps: - - name: Download all the dists - uses: actions/download-artifact@v3 - with: - name: python-package-distributions - path: dist/ - - name: Publish distribution 📦 to TestPyPI - uses: pypa/gh-action-pypi-publish@release/v1 - with: - repository-url: https://test.pypi.org/legacy/ diff --git a/README.md b/README.md index 8900e41..ac704f8 100644 --- a/README.md +++ b/README.md @@ -84,12 +84,12 @@ By following these steps, you can seamlessly transform your dataset, define nece ```console # Re-shard pipeline - # $MIMICIV_MEDS_DIR is the directory containing the input, MEDS v0.3 formatted MIMIC-IV data + # $MIMICIV_input_dir is the directory containing the input, MEDS v0.3 formatted MIMIC-IV data # $MEDS_TAB_COHORT_DIR is the directory where the re-sharded MEDS dataset will be stored, and where your model # will store cached files during processing by default. # $N_PATIENTS_PER_SHARD is the number of patients per shard you want to use. MEDS_transform-reshard_to_split \ - input_dir="$MIMICIV_MEDS_DIR" \ + input_dir="$MIMICIV_input_dir" \ cohort_dir="$MEDS_TAB_COHORT_DIR" \ 'stages=["reshard_to_split"]' \ stage="reshard_to_split" \ @@ -103,14 +103,14 @@ By following these steps, you can seamlessly transform your dataset, define nece - static codes (codes without timestamps) - static numerical codes (codes without timestamps but with numerical values). - This script further caches feature names and frequencies in a dataset stored in a `code_metadata.parquet` file within the `MEDS_cohort_dir` argument specified as a hydra-style command line argument. + This script further caches feature names and frequencies in a dataset stored in a `code_metadata.parquet` file within the `input_dir` argument specified as a hydra-style command line argument. 2. **`meds-tab-tabularize-static`**: Filters and processes the dataset based on the frequency of codes, generating a tabular vector for each patient at each timestamp in the shards. Each row corresponds to a unique `subject_id` and `timestamp` combination, thus rows are duplicated across multiple timestamps for the same patient. 
**Example: Tabularizing static data** with the minimum code frequency of 10, window sizes of `[1d, 30d, 365d, full]`, and value aggregation methods of `[static/present, static/first, code/count, value/count, value/sum, value/sum_sqd, value/min, value/max]` ```console - meds-tab-tabularize-static MEDS_cohort_dir="path_to_data" \ + meds-tab-tabularize-static input_dir="path_to_data" \ tabularization.min_code_inclusion_frequency=10 \ tabularization.window_sizes=[1d,30d,365d,full] \ do_overwrite=False \ @@ -127,19 +127,19 @@ By following these steps, you can seamlessly transform your dataset, define nece meds-tab-tabularize-time-series --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ - MEDS_cohort_dir="path_to_data" \ + input_dir="path_to_data" \ tabularization.min_code_inclusion_frequency=10 \ do_overwrite=False \ tabularization.window_sizes=[1d,30d,365d,full] \ tabularization.aggs=[static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max] ``` -4. **`meds-tab-cache-task`**: Aligns task-specific labels with the nearest prior event in the tabularized data. It requires a labeled dataset directory with three columns (`subject_id`, `timestamp`, `label`) structured similarly to the `MEDS_cohort_dir`. +4. **`meds-tab-cache-task`**: Aligns task-specific labels with the nearest prior event in the tabularized data. It requires a labeled dataset directory with three columns (`subject_id`, `timestamp`, `label`) structured similarly to the `input_dir`. **Example: Align tabularized data** for a specific task `$TASK` and labels that has pulled from [ACES](https://github.com/justin13601/ACES) ```console - meds-tab-cache-task MEDS_cohort_dir="path_to_data" \ + meds-tab-cache-task input_dir="path_to_data" \ task_name=$TASK \ tabularization.min_code_inclusion_frequency=10 \ do_overwrite=False \ @@ -151,7 +151,7 @@ By following these steps, you can seamlessly transform your dataset, define nece ```console meds-tab-xgboost --multirun \ - MEDS_cohort_dir="path_to_data" \ + input_dir="path_to_data" \ task_name=$TASK \ output_dir="output_directory" \ tabularization.min_code_inclusion_frequency=10 \ @@ -436,7 +436,7 @@ A single XGBoost run was completed to profile time and memory usage. 
This was do ```console meds-tab-xgboost - MEDS_cohort_dir="path_to_data" \ + input_dir="path_to_data" \ task_name=$TASK \ output_dir="output_directory" \ do_overwrite=False \ @@ -506,7 +506,7 @@ The XGBoost sweep was run using the following command for each `$TASK`: ```console meds-tab-xgboost --multirun \ - MEDS_cohort_dir="path_to_data" \ + input_dir="path_to_data" \ task_name=$TASK \ output_dir="output_directory" \ tabularization.window_sizes=$(generate-subsets [1d,30d,365d,full]) \ @@ -529,14 +529,14 @@ The hydra sweeper swept over the parameters: ```yaml params: - +model_params.model.eta: tag(log, interval(0.001, 1)) - +model_params.model.lambda: tag(log, interval(0.001, 1)) - +model_params.model.alpha: tag(log, interval(0.001, 1)) - +model_params.model.subsample: interval(0.5, 1) - +model_params.model.min_child_weight: interval(1e-2, 100) - +model_params.model.max_depth: range(2, 16) - model_params.num_boost_round: range(100, 1000) - model_params.early_stopping_rounds: range(1, 10) + model.eta: tag(log, interval(0.001, 1)) + model.lambda: tag(log, interval(0.001, 1)) + model.alpha: tag(log, interval(0.001, 1)) + model.subsample: interval(0.5, 1) + model.min_child_weight: interval(1e-2, 100) + model.max_depth: range(2, 16) + num_boost_round: range(100, 1000) + early_stopping_rounds: range(1, 10) tabularization.min_code_inclusion_frequency: tag(log, range(10, 1000000)) ``` diff --git a/docs/source/overview.md b/docs/source/overview.md index 44f68bf..1d453f0 100644 --- a/docs/source/overview.md +++ b/docs/source/overview.md @@ -38,14 +38,14 @@ See [`/tests/test_integration.py`](https://github.com/mmcdermott/MEDS_Tabular_Au - static codes (codes without timestamps) - static numerical codes (codes without timestamps but with numerical values). - This script further caches feature names and frequencies in a dataset stored in a `code_metadata.parquet` file within the `MEDS_cohort_dir` argument specified as a hydra-style command line argument. + This script further caches feature names and frequencies in a dataset stored in a `code_metadata.parquet` file within the `input_dir` argument specified as a hydra-style command line argument. 2. **`meds-tab-tabularize-static`**: Filters and processes the dataset based on the frequency of codes, generating a tabular vector for each patient at each timestamp in the shards. Each row corresponds to a unique `subject_id` and `timestamp` combination, thus rows are duplicated across multiple timestamps for the same patient. **Example: Tabularizing static data** with the minimum code frequency of 10, window sizes of `[1d, 30d, 365d, full]`, and value aggregation methods of `[static/present, static/first, code/count, value/count, value/sum, value/sum_sqd, value/min, value/max]` ```console - meds-tab-tabularize-static MEDS_cohort_dir="path_to_data" \ + meds-tab-tabularize-static input_dir="path_to_data" \ tabularization.min_code_inclusion_frequency=10 \ tabularization.window_sizes=[1d,30d,365d,full] \ do_overwrite=False \ @@ -62,19 +62,19 @@ See [`/tests/test_integration.py`](https://github.com/mmcdermott/MEDS_Tabular_Au meds-tab-tabularize-time-series --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ - MEDS_cohort_dir="path_to_data" \ + input_dir="path_to_data" \ tabularization.min_code_inclusion_frequency=10 \ do_overwrite=False \ tabularization.window_sizes=[1d,30d,365d,full] \ tabularization.aggs=[static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max] ``` -4. 
**`meds-tab-cache-task`**: Aligns task-specific labels with the nearest prior event in the tabularized data. It requires a labeled dataset directory with three columns (`subject_id`, `timestamp`, `label`) structured similarly to the `MEDS_cohort_dir`. +4. **`meds-tab-cache-task`**: Aligns task-specific labels with the nearest prior event in the tabularized data. It requires a labeled dataset directory with three columns (`subject_id`, `timestamp`, `label`) structured similarly to the `input_dir`. **Example: Align tabularized data** for a specific task `$TASK` and labels that has pulled from [ACES](https://github.com/justin13601/ACES) ```console - meds-tab-cache-task MEDS_cohort_dir="path_to_data" \ + meds-tab-cache-task input_dir="path_to_data" \ task_name=$TASK \ tabularization.min_code_inclusion_frequency=10 \ do_overwrite=False \ @@ -86,7 +86,7 @@ See [`/tests/test_integration.py`](https://github.com/mmcdermott/MEDS_Tabular_Au ```console meds-tab-xgboost --multirun \ - MEDS_cohort_dir="path_to_data" \ + input_dir="path_to_data" \ task_name=$TASK \ output_dir="output_directory" \ tabularization.min_code_inclusion_frequency=10 \ diff --git a/docs/source/prediction.md b/docs/source/prediction.md index 35131d8..18f19c0 100644 --- a/docs/source/prediction.md +++ b/docs/source/prediction.md @@ -14,7 +14,7 @@ A single XGBoost run was completed to profile time and memory usage. This was do ```console meds-tab-xgboost - MEDS_cohort_dir="path_to_data" \ + input_dir="path_to_data" \ task_name=$TASK \ output_dir="output_directory" \ do_overwrite=False \ @@ -84,7 +84,7 @@ The XGBoost sweep was run using the following command for each `$TASK`: ```console meds-tab-xgboost --multirun \ - MEDS_cohort_dir="path_to_data" \ + input_dir="path_to_data" \ task_name=$TASK \ output_dir="output_directory" \ tabularization.window_sizes=$(generate-permutations [1d,30d,365d,full]) \ @@ -107,14 +107,14 @@ The hydra sweeper swept over the parameters: ```yaml params: - +model_params.model.eta: tag(log, interval(0.001, 1)) - +model_params.model.lambda: tag(log, interval(0.001, 1)) - +model_params.model.alpha: tag(log, interval(0.001, 1)) - +model_params.model.subsample: interval(0.5, 1) - +model_params.model.min_child_weight: interval(1e-2, 100) - +model_params.model.max_depth: range(2, 16) - model_params.num_boost_round: range(100, 1000) - model_params.early_stopping_rounds: range(1, 10) + model.eta: tag(log, interval(0.001, 1)) + model.lambda: tag(log, interval(0.001, 1)) + model.alpha: tag(log, interval(0.001, 1)) + model.subsample: interval(0.5, 1) + model.min_child_weight: interval(1e-2, 100) + model.max_depth: range(2, 16) + num_boost_round: range(100, 1000) + early_stopping_rounds: range(1, 10) tabularization.min_code_inclusion_frequency: tag(log, range(10, 1000000)) ``` diff --git a/pyproject.toml b/pyproject.toml index 59b30cf..1b75489 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,8 @@ dependencies = [ "scikit-learn", "hydra-optuna-sweeper", "hydra-joblib-launcher", "ml-mixins", "meds==0.3.3", "meds-transforms==0.0.7", ] +[tool.setuptools_scm] + [project.scripts] meds-tab-describe = "MEDS_tabular_automl.scripts.describe_codes:main" meds-tab-tabularize-static = "MEDS_tabular_automl.scripts.tabularize_static:main" diff --git a/src/MEDS_tabular_automl/configs/default.yaml b/src/MEDS_tabular_automl/configs/default.yaml index 82a2164..7d4e392 100644 --- a/src/MEDS_tabular_automl/configs/default.yaml +++ b/src/MEDS_tabular_automl/configs/default.yaml @@ -1,13 +1,13 @@ -MEDS_cohort_dir: ??? 
-output_cohort_dir: ??? +input_dir: ??? +output_dir: ??? do_overwrite: False seed: 1 tqdm: False worker: 0 loguru_init: False -log_dir: ${output_cohort_dir}/.logs/ -cache_dir: ${output_cohort_dir}/.cache +log_dir: ${output_dir}/.logs/ +cache_dir: ${output_dir}/.cache hydra: verbose: False diff --git a/src/MEDS_tabular_automl/configs/describe_codes.yaml b/src/MEDS_tabular_automl/configs/describe_codes.yaml index ec980bf..007307c 100644 --- a/src/MEDS_tabular_automl/configs/describe_codes.yaml +++ b/src/MEDS_tabular_automl/configs/describe_codes.yaml @@ -2,8 +2,7 @@ defaults: - default - _self_ -input_dir: ${output_cohort_dir}/data # Where to store output code frequency data -output_filepath: ${output_cohort_dir}/metadata/codes.parquet +output_filepath: ${output_dir}/metadata/codes.parquet name: describe_codes diff --git a/src/MEDS_tabular_automl/configs/launch_autogluon.yaml b/src/MEDS_tabular_automl/configs/launch_autogluon.yaml deleted file mode 100644 index 908e79d..0000000 --- a/src/MEDS_tabular_automl/configs/launch_autogluon.yaml +++ /dev/null @@ -1,28 +0,0 @@ -defaults: - - default - - tabularization: default - - imputer: default - - normalization: default - - _self_ - -task_name: task - -# Task cached data dir -input_dir: ${output_cohort_dir}/${task_name}/task_cache -# Directory with task labels -input_label_dir: ${output_cohort_dir}/${task_name}/labels/ -# Where to output the model and cached data -model_dir: ${output_cohort_dir}/autogluon/autogluon_${now:%Y-%m-%d_%H-%M-%S} -model_log_dir: ${model_dir}/.logs/ -output_filepath: ${model_dir} - -# Model parameters -model_params: - iterator: - keep_data_in_memory: True - binarize_task: True - -log_dir: ${model_dir}/.logs/ -log_filepath: ${log_dir}/log.txt - -name: launch_autogluon diff --git a/src/MEDS_tabular_automl/configs/launch_model.yaml b/src/MEDS_tabular_automl/configs/launch_model.yaml index 9938cd9..7008acf 100644 --- a/src/MEDS_tabular_automl/configs/launch_model.yaml +++ b/src/MEDS_tabular_automl/configs/launch_model.yaml @@ -1,39 +1,28 @@ defaults: - - _self_ - default - tabularization: default - - model: xgboost # This can be changed to sgd_classifier or any other model - - imputer: default - - normalization: default - - override hydra/callbacks: evaluation_callback + - model_launcher: xgboost - override hydra/sweeper: optuna - - override hydra/sweeper/sampler: tpe + - override hydra/callbacks: evaluation_callback - override hydra/launcher: joblib + - _self_ -task_name: task +task_name: ??? -# Task cached data dir -input_dir: ${output_cohort_dir}/${task_name}/task_cache -# Directory with task labels -input_label_dir: ${output_cohort_dir}/${task_name}/labels/ +# Location of task, split, and shard specific tabularized data +input_tabularized_cache_dir: ${output_dir}/${task_name}/task_cache +# Location of task, split, and shard specific label data +input_label_cache_dir: ${output_dir}/${task_name}/labels # Where to output the model and cached data -model_saving: - model_dir: ${output_cohort_dir}/model/model_${now:%Y-%m-%d_%H-%M-%S} - model_file_stem: model - model_file_extension: .json - delete_below_top_k: -1 -model_logging: - model_log_dir: ${model_saving.model_dir}/.logs/ - performance_log_stem: performance - config_log_stem: config +output_model_dir: ??? 
+ +delete_below_top_k: -1 name: launch_model hydra: - verbose: False - job: - name: MEDS_TAB_${name}_${worker}_${now:%Y-%m-%d_%H-%M-%S} sweep: - dir: ${model_log_dir} + dir: ${output_model_dir}/sweeps/{now:%Y-%m-%d-%H-%M-%S}/ + subdir: "1" run: - dir: ${model_log_dir} + dir: ${path.model_log_dir} diff --git a/src/MEDS_tabular_automl/configs/model/knn_classifier.yaml b/src/MEDS_tabular_automl/configs/model/knn_classifier.yaml deleted file mode 100644 index 1ca034a..0000000 --- a/src/MEDS_tabular_automl/configs/model/knn_classifier.yaml +++ /dev/null @@ -1,44 +0,0 @@ -# @package _global_ - -model_target: - _target_: MEDS_tabular_automl.sklearn_model.SklearnModel.initialize - model_params: ${model_params} - input_dir: ${input_dir} - input_label_dir: ${input_label_dir} - model_dir: ${model_saving.model_dir} - model_file_stem: ${model_saving.model_file_stem} - model_file_extension: ${model_saving.model_file_extension} - log_dir: ${log_dir} - cache_dir: ${cache_dir} - imputer: ${model_params.iterator.imputer} - normalization: ${model_params.iterator.normalization} - -model_params: - epochs: 20 - early_stopping_rounds: 5 - model: - _target_: sklearn.neighbors.KNeighborsClassifier - weights: "distance" - leaf_size: 30 - p: 2 - metric: "minkowski" - iterator: - keep_data_in_memory: True - binarize_task: True - normalization: ${normalization} - imputer: ${imputer} - -hydra: - sweeper: - direction: maximize - n_trials: 250 - n_jobs: 25 - - params: - model_params.model.n_neighbors: range(1, 20) - model_params.model.weights: choice(['uniform', 'distance']) - model_params.model.leaf_size: range(10, 50) - model_params.model.p: choice([1, 2]) - model_params.model.metric: choice(['minkowski', 'euclidean', 'manhattan']) - model_params.epochs: range(10, 100) - model_params.early_stopping_rounds: range(1, 10) diff --git a/src/MEDS_tabular_automl/configs/model/logistic_regression.yaml b/src/MEDS_tabular_automl/configs/model/logistic_regression.yaml deleted file mode 100644 index 0f74a7b..0000000 --- a/src/MEDS_tabular_automl/configs/model/logistic_regression.yaml +++ /dev/null @@ -1,49 +0,0 @@ -# @package _global_ - -model_target: - _target_: MEDS_tabular_automl.sklearn_model.SklearnModel.initialize - model_params: ${model_params} - input_dir: ${input_dir} - input_label_dir: ${input_label_dir} - model_dir: ${model_saving.model_dir} - model_file_stem: ${model_saving.model_file_stem} - model_file_extension: ${model_saving.model_file_extension} - log_dir: ${log_dir} - cache_dir: ${cache_dir} - imputer: ${model_params.iterator.imputer} - normalization: ${model_params.iterator.normalization} - -model_params: - epochs: 20 - early_stopping_rounds: 5 - model: - _target_: sklearn.linear_model.LogisticRegression - penalty: "l2" - dual: false - tol: 0.0001 - C: 1.0 - fit_intercept: True - intercept_scaling: 1 - class_weight: null - random_state: null - solver: "lbfgs" - max_iter: 100 - - iterator: - keep_data_in_memory: True - binarize_task: True - normalization: ${normalization} - imputer: ${imputer} - -hydra: - sweeper: - direction: maximize - n_trials: 250 - n_jobs: 25 - - params: - model_params.model.C: tag(log, interval(1e-6, 1)) - model_params.model.penalty: choice(['l1', 'l2', 'elasticnet']) - model_params.model.solver: choice(['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']) - model_params.epochs: range(10, 100) - model_params.early_stopping_rounds: range(1, 10) diff --git a/src/MEDS_tabular_automl/configs/model/random_forest_classifier.yaml b/src/MEDS_tabular_automl/configs/model/random_forest_classifier.yaml 
deleted file mode 100644 index 58a9671..0000000 --- a/src/MEDS_tabular_automl/configs/model/random_forest_classifier.yaml +++ /dev/null @@ -1,51 +0,0 @@ -# @package _global_ - -model_target: - _target_: MEDS_tabular_automl.sklearn_model.SklearnModel.initialize - model_params: ${model_params} - input_dir: ${input_dir} - input_label_dir: ${input_label_dir} - model_dir: ${model_saving.model_dir} - model_file_stem: ${model_saving.model_file_stem} - model_file_extension: ${model_saving.model_file_extension} - log_dir: ${log_dir} - cache_dir: ${cache_dir} - imputer: ${model_params.iterator.imputer} - normalization: ${model_params.iterator.normalization} - -model_params: - epochs: 20 - early_stopping_rounds: 5 - model: - _target_: sklearn.ensemble.RandomForestClassifier - criterion: "gini" - max_depth: null - min_samples_split: 2 - min_samples_leaf: 1 - min_weight_fraction_leaf: 0.0 - max_features: "sqrt" - max_leaf_nodes: null - min_impurity_decrease: 0.0 - bootstrap: True - iterator: - keep_data_in_memory: True - binarize_task: True - normalization: ${normalization} - imputer: ${imputer} - -hydra: - sweeper: - direction: maximize - n_trials: 250 - n_jobs: 25 - - params: - model_params.model.n_estimators: range(50, 300, 50) - model_params.model.max_depth: choice([null, 10, 20, 30, 40, 50]) - model_params.model.min_samples_split: range(2, 11) - model_params.model.min_samples_leaf: range(1, 5) - model_params.model.max_features: choice(['sqrt', 'log2', null]) - model_params.model.bootstrap: choice([True, False]) - model_params.model.criterion: choice(['gini', 'entropy']) - model_params.epochs: range(10, 100) - model_params.early_stopping_rounds: range(1, 10) diff --git a/src/MEDS_tabular_automl/configs/model/sgd_classifier.yaml b/src/MEDS_tabular_automl/configs/model/sgd_classifier.yaml deleted file mode 100644 index 2f2b57f..0000000 --- a/src/MEDS_tabular_automl/configs/model/sgd_classifier.yaml +++ /dev/null @@ -1,38 +0,0 @@ -# @package _global_ - -model_target: - _target_: MEDS_tabular_automl.sklearn_model.SklearnModel.initialize - model_params: ${model_params} - input_dir: ${input_dir} - input_label_dir: ${input_label_dir} - model_dir: ${model_saving.model_dir} - model_file_stem: ${model_saving.model_file_stem} - model_file_extension: ${model_saving.model_file_extension} - log_dir: ${log_dir} - cache_dir: ${cache_dir} - imputer: ${model_params.iterator.imputer} - normalization: ${model_params.iterator.normalization} - -model_params: - epochs: 20 - early_stopping_rounds: 5 - model: - _target_: sklearn.linear_model.SGDClassifier - loss: log_loss - iterator: - keep_data_in_memory: True - binarize_task: True - normalization: ${normalization} - imputer: ${imputer} - -hydra: - sweeper: - direction: maximize - n_trials: 250 - n_jobs: 25 - params: - +model_params.model.alpha: tag(log, interval(1e-6, 1)) - +model_params.model.l1_ratio: interval(0, 1) - +model_params.model.penalty: choice(['l1', 'l2', 'elasticnet']) - model_params.epochs: range(10, 100) - model_params.early_stopping_rounds: range(1, 10) diff --git a/src/MEDS_tabular_automl/configs/model/xgboost.yaml b/src/MEDS_tabular_automl/configs/model/xgboost.yaml deleted file mode 100644 index 793cc29..0000000 --- a/src/MEDS_tabular_automl/configs/model/xgboost.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# @package _global_ - -model_target: - _target_: MEDS_tabular_automl.xgboost_model.XGBoostModel.initialize - model_params: ${model_params} - input_dir: ${input_dir} - input_label_dir: ${input_label_dir} - model_dir: ${model_saving.model_dir} - 
model_file_stem: ${model_saving.model_file_stem} - model_file_extension: ${model_saving.model_file_extension} - log_dir: ${log_dir} - cache_dir: ${cache_dir} - imputer: ${model_params.iterator.imputer} - normalization: ${model_params.iterator.normalization} - # tabularization: ${tabularization} # Ideally we should define tabularization here, but there is an issue initializing with it's resolvers. - -model_params: - num_boost_round: 1000 - early_stopping_rounds: 5 - model: - booster: gbtree - device: cpu - nthread: 1 - tree_method: hist - objective: binary:logistic - iterator: - keep_data_in_memory: True - binarize_task: True - normalization: ${normalization} - imputer: ${imputer} - -hydra: - sweeper: - direction: maximize - n_trials: 250 - n_jobs: 25 - - params: - +model_params.model.eta: tag(log, interval(0.001, 1)) - +model_params.model.lambda: tag(log, interval(0.001, 1)) - +model_params.model.alpha: tag(log, interval(0.001, 1)) - +model_params.model.subsample: interval(0.5, 1) - +model_params.model.min_child_weight: interval(1e-2, 100) - model_params.num_boost_round: range(100, 1000) - model_params.early_stopping_rounds: range(1, 10) - +model_params.model.max_depth: range(2, 16) - tabularization.min_code_inclusion_frequency: tag(log, range(10, 1000000)) diff --git a/src/MEDS_tabular_automl/configs/model_launcher/__init__.py b/src/MEDS_tabular_automl/configs/model_launcher/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/MEDS_tabular_automl/configs/model_launcher/autogluon.yaml b/src/MEDS_tabular_automl/configs/model_launcher/autogluon.yaml new file mode 100644 index 0000000..b7d02cd --- /dev/null +++ b/src/MEDS_tabular_automl/configs/model_launcher/autogluon.yaml @@ -0,0 +1,3 @@ +defaults: + - default + - _self_ diff --git a/src/MEDS_tabular_automl/configs/model_launcher/data_loading_params/__init__.py b/src/MEDS_tabular_automl/configs/model_launcher/data_loading_params/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/MEDS_tabular_automl/configs/model_launcher/data_loading_params/default.yaml b/src/MEDS_tabular_automl/configs/model_launcher/data_loading_params/default.yaml new file mode 100644 index 0000000..723131f --- /dev/null +++ b/src/MEDS_tabular_automl/configs/model_launcher/data_loading_params/default.yaml @@ -0,0 +1,2 @@ +keep_data_in_memory: True +binarize_task: True diff --git a/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/__init__.py b/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/default.yaml b/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/default.yaml new file mode 100644 index 0000000..5cf0c5b --- /dev/null +++ b/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/default.yaml @@ -0,0 +1,3 @@ +defaults: + - imputer: default + - normalization: default diff --git a/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/imputer/__init__.py b/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/imputer/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/MEDS_tabular_automl/configs/imputer/default.yaml b/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/imputer/default.yaml similarity index 100% rename from src/MEDS_tabular_automl/configs/imputer/default.yaml rename to 
src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/imputer/default.yaml diff --git a/src/MEDS_tabular_automl/configs/imputer/mean_imputer.yaml b/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/imputer/mean_imputer.yaml similarity index 100% rename from src/MEDS_tabular_automl/configs/imputer/mean_imputer.yaml rename to src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/imputer/mean_imputer.yaml diff --git a/src/MEDS_tabular_automl/configs/imputer/median_imputer.yaml b/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/imputer/median_imputer.yaml similarity index 100% rename from src/MEDS_tabular_automl/configs/imputer/median_imputer.yaml rename to src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/imputer/median_imputer.yaml diff --git a/src/MEDS_tabular_automl/configs/imputer/mode_imputer.yaml b/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/imputer/mode_imputer.yaml similarity index 100% rename from src/MEDS_tabular_automl/configs/imputer/mode_imputer.yaml rename to src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/imputer/mode_imputer.yaml diff --git a/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/normalization/__init__.py b/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/normalization/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/MEDS_tabular_automl/configs/normalization/default.yaml b/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/normalization/default.yaml similarity index 100% rename from src/MEDS_tabular_automl/configs/normalization/default.yaml rename to src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/normalization/default.yaml diff --git a/src/MEDS_tabular_automl/configs/normalization/max_abs_scaler.yaml b/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/normalization/max_abs_scaler.yaml similarity index 100% rename from src/MEDS_tabular_automl/configs/normalization/max_abs_scaler.yaml rename to src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/normalization/max_abs_scaler.yaml diff --git a/src/MEDS_tabular_automl/configs/normalization/standard_scaler.yaml b/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/normalization/standard_scaler.yaml similarity index 100% rename from src/MEDS_tabular_automl/configs/normalization/standard_scaler.yaml rename to src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/normalization/standard_scaler.yaml diff --git a/src/MEDS_tabular_automl/configs/model_launcher/default.yaml b/src/MEDS_tabular_automl/configs/model_launcher/default.yaml new file mode 100644 index 0000000..7b75e6e --- /dev/null +++ b/src/MEDS_tabular_automl/configs/model_launcher/default.yaml @@ -0,0 +1,13 @@ +# @package _global_ + +defaults: + - path: default + - data_processing_params: default + - data_loading_params: default + - _self_ + +model_launcher: + path: ${path} + data_processing_params: ${data_processing_params} + data_loading_params: ${data_loading_params} + tabularization: ${tabularization} diff --git a/src/MEDS_tabular_automl/configs/model_launcher/knn_classifier.yaml b/src/MEDS_tabular_automl/configs/model_launcher/knn_classifier.yaml new file mode 100644 index 0000000..9f85e97 --- /dev/null +++ b/src/MEDS_tabular_automl/configs/model_launcher/knn_classifier.yaml @@ -0,0 +1,30 @@ +# @package _global_ + +defaults: + - default + - 
training_params: default + - _self_ + +model_launcher: + _target_: MEDS_tabular_automl.sklearn_model.SklearnModel.initialize + + model: + _target_: sklearn.neighbors.KNeighborsClassifier + weights: "distance" + leaf_size: 30 + p: 2 + metric: "minkowski" + + path: + model_file_extension: .pkl + +hydra: + sweeper: + params: + +model_launcher.model.n_neighbors: range(1, 20) + model_launcher.model.weights: choice('uniform', 'distance') + model_launcher.model.leaf_size: range(10, 50) + model_launcher.model.p: choice(1, 2) + model_launcher.model.metric: choice('minkowski', 'euclidean', 'manhattan') + model_launcher.training_params.epochs: range(10, 100) + model_launcher.training_params.early_stopping_rounds: range(1, 10) diff --git a/src/MEDS_tabular_automl/configs/model_launcher/logistic_regression.yaml b/src/MEDS_tabular_automl/configs/model_launcher/logistic_regression.yaml new file mode 100644 index 0000000..4531efc --- /dev/null +++ b/src/MEDS_tabular_automl/configs/model_launcher/logistic_regression.yaml @@ -0,0 +1,33 @@ +# @package _global_ + +defaults: + - default + - training_params: default + - _self_ + +model_launcher: + _target_: MEDS_tabular_automl.sklearn_model.SklearnModel.initialize + + model: + _target_: sklearn.linear_model.LogisticRegression + penalty: "l2" + dual: false + tol: 0.0001 + C: 1.0 + fit_intercept: True + intercept_scaling: 1 + class_weight: null + random_state: null + solver: "lbfgs" + max_iter: 100 + + path: + model_file_extension: .pkl + +hydra: + sweeper: + params: + model_launcher.model.C: tag(log, interval(1e-6, 1)) + model_launcher.model.solver: choice('lbfgs', 'sag', 'saga') + model_launcher.training_params.epochs: range(10, 100) + model_launcher.training_params.early_stopping_rounds: range(1, 10) diff --git a/src/MEDS_tabular_automl/configs/model_launcher/path/__init__.py b/src/MEDS_tabular_automl/configs/model_launcher/path/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/MEDS_tabular_automl/configs/model_launcher/path/default.yaml b/src/MEDS_tabular_automl/configs/model_launcher/path/default.yaml new file mode 100644 index 0000000..d739ce3 --- /dev/null +++ b/src/MEDS_tabular_automl/configs/model_launcher/path/default.yaml @@ -0,0 +1,10 @@ +input_tabularized_cache_dir: ${input_tabularized_cache_dir} +input_label_cache_dir: ${input_label_cache_dir} +output_model_dir: ${output_model_dir} +model_file_stem: model +model_file_extension: .json +log_dir: ${log_dir} +cache_dir: ${cache_dir} +model_log_dir: ${output_model_dir}/.logs/ +performance_log_stem: performance +config_log_stem: config diff --git a/src/MEDS_tabular_automl/configs/model_launcher/random_forest_classifier.yaml b/src/MEDS_tabular_automl/configs/model_launcher/random_forest_classifier.yaml new file mode 100644 index 0000000..4a50beb --- /dev/null +++ b/src/MEDS_tabular_automl/configs/model_launcher/random_forest_classifier.yaml @@ -0,0 +1,37 @@ +# @package _global_ + +defaults: + - default + - training_params: default + - _self_ + +model_launcher: + _target_: MEDS_tabular_automl.sklearn_model.SklearnModel.initialize + + model: + _target_: sklearn.ensemble.RandomForestClassifier + criterion: "gini" + max_depth: null + min_samples_split: 2 + min_samples_leaf: 1 + min_weight_fraction_leaf: 0.0 + max_features: "sqrt" + max_leaf_nodes: null + min_impurity_decrease: 0.0 + bootstrap: True + + path: + model_file_extension: .pkl + +hydra: + sweeper: + params: + +model_launcher.model.n_estimators: range(50, 300, 50) + model_launcher.model.max_depth: choice(10, 20, 30, 40, 50) + 
model_launcher.model.min_samples_split: range(2, 11) + model_launcher.model.min_samples_leaf: range(1, 5) + model_launcher.model.max_features: choice('sqrt', 'log2') + model_launcher.model.bootstrap: choice(True, False) + model_launcher.model.criterion: choice('gini', 'entropy') + model_launcher.training_params.epochs: range(10, 100) + model_launcher.training_params.early_stopping_rounds: range(1, 10) diff --git a/src/MEDS_tabular_automl/configs/model_launcher/sgd_classifier.yaml b/src/MEDS_tabular_automl/configs/model_launcher/sgd_classifier.yaml new file mode 100644 index 0000000..9f6cb1d --- /dev/null +++ b/src/MEDS_tabular_automl/configs/model_launcher/sgd_classifier.yaml @@ -0,0 +1,24 @@ +# @package _global_ +defaults: + - default + - training_params: default + - _self_ + +model_launcher: + _target_: MEDS_tabular_automl.sklearn_model.SklearnModel.initialize + + model: + _target_: sklearn.linear_model.SGDClassifier + loss: log_loss + + path: + model_file_extension: .pkl + +hydra: + sweeper: + params: + +model_launcher.model.alpha: tag(log, interval(1e-6, 1)) + +model_launcher.model.l1_ratio: interval(0, 1) + +model_launcher.model.penalty: choice('l1', 'l2', 'elasticnet') + model_launcher.training_params.epochs: range(10, 100) + model_launcher.training_params.early_stopping_rounds: range(1, 10) diff --git a/src/MEDS_tabular_automl/configs/model_launcher/training_params/__init__.py b/src/MEDS_tabular_automl/configs/model_launcher/training_params/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/MEDS_tabular_automl/configs/model_launcher/training_params/default.yaml b/src/MEDS_tabular_automl/configs/model_launcher/training_params/default.yaml new file mode 100644 index 0000000..abd29f2 --- /dev/null +++ b/src/MEDS_tabular_automl/configs/model_launcher/training_params/default.yaml @@ -0,0 +1,6 @@ +# @package _global_ + +model_launcher: + training_params: + epochs: 20 + early_stopping_rounds: 5 diff --git a/src/MEDS_tabular_automl/configs/model_launcher/xgboost.yaml b/src/MEDS_tabular_automl/configs/model_launcher/xgboost.yaml new file mode 100644 index 0000000..b7e9065 --- /dev/null +++ b/src/MEDS_tabular_automl/configs/model_launcher/xgboost.yaml @@ -0,0 +1,31 @@ +# @package _global_ +defaults: + - default + - _self_ + +model_launcher: + _target_: MEDS_tabular_automl.xgboost_model.XGBoostModel.initialize + + model: + booster: gbtree + device: cpu + nthread: 1 + tree_method: hist + objective: binary:logistic + + training_params: + num_boost_round: 1000 + early_stopping_rounds: 5 + +hydra: + sweeper: + params: + +model_launcher.model.eta: tag(log, interval(0.001, 1)) + +model_launcher.model.lambda: tag(log, interval(0.001, 1)) + +model_launcher.model.alpha: tag(log, interval(0.001, 1)) + +model_launcher.model.subsample: interval(0.5, 1) + +model_launcher.model.min_child_weight: interval(1e-2, 100) + +model_launcher.model.max_depth: range(2, 16) + model_launcher.training_params.num_boost_round: range(100, 1000) + model_launcher.training_params.early_stopping_rounds: range(1, 10) + tabularization.min_code_inclusion_frequency: tag(log, range(10, 1000000)) diff --git a/src/MEDS_tabular_automl/configs/tabularization.yaml b/src/MEDS_tabular_automl/configs/tabularization.yaml index cf03d63..5d74eb9 100644 --- a/src/MEDS_tabular_automl/configs/tabularization.yaml +++ b/src/MEDS_tabular_automl/configs/tabularization.yaml @@ -5,8 +5,8 @@ defaults: # Raw data # Where the code metadata is stored -input_code_metadata_fp: ${output_cohort_dir}/metadata/codes.parquet -input_dir: 
${output_cohort_dir}/data -output_dir: ${output_cohort_dir}/tabularize +input_code_metadata_fp: ${output_dir}/metadata/codes.parquet +input_dir: ${input_dir} +output_tabularized_dir: ${output_dir}/tabularize name: tabularization diff --git a/src/MEDS_tabular_automl/configs/tabularization/default.yaml b/src/MEDS_tabular_automl/configs/tabularization/default.yaml index ada7dc9..8c51383 100644 --- a/src/MEDS_tabular_automl/configs/tabularization/default.yaml +++ b/src/MEDS_tabular_automl/configs/tabularization/default.yaml @@ -1,5 +1,5 @@ # User inputs -filtered_code_metadata_fp: ${output_cohort_dir}/metadata/codes.parquet +filtered_code_metadata_fp: ${output_dir}/metadata/codes.parquet allowed_codes: null min_code_inclusion_count: 10 min_code_inclusion_frequency: null diff --git a/src/MEDS_tabular_automl/configs/task_specific_caching.yaml b/src/MEDS_tabular_automl/configs/task_specific_caching.yaml index 63fed0f..a372134 100644 --- a/src/MEDS_tabular_automl/configs/task_specific_caching.yaml +++ b/src/MEDS_tabular_automl/configs/task_specific_caching.yaml @@ -2,15 +2,16 @@ defaults: - default - tabularization: default - _self_ -task_name: task +task_name: ??? -# Tabularized Data -input_dir: ${output_cohort_dir}/tabularize +# Directory of tabularized data +input_tabularized_dir: ${output_dir}/tabularize # Where the labels are stored, with columns subject_id, timestamp, label -input_label_dir: ${MEDS_cohort_dir}/tasks/${task_name}/ -# Where to output the task specific tabularized data -output_dir: ${output_cohort_dir}/${task_name}/task_cache -output_label_dir: ${output_cohort_dir}/${task_name}/labels +input_label_dir: ??? +# Where to output the task, split, and shard specific tabularized data +output_tabularized_cache_dir: ${output_dir}/${task_name}/task_cache +# Where to output the task, split, and shard specific label data +output_label_cache_dir: ${output_dir}/${task_name}/labels label_column: "boolean_value" diff --git a/src/MEDS_tabular_automl/describe_codes.py b/src/MEDS_tabular_automl/describe_codes.py index 70c53bd..23cdb98 100644 --- a/src/MEDS_tabular_automl/describe_codes.py +++ b/src/MEDS_tabular_automl/describe_codes.py @@ -2,7 +2,7 @@ import polars as pl -from MEDS_tabular_automl.utils import DF_T, get_feature_names +from MEDS_tabular_automl.utils import get_feature_names def convert_to_df(freq_dict: dict[str, int]) -> pl.DataFrame: @@ -65,7 +65,7 @@ def convert_to_freq_dict(df: pl.LazyFrame) -> dict[str, dict[int, int]]: return dict(df.collect().iter_rows()) -def compute_feature_frequencies(shard_df: DF_T) -> pl.DataFrame: +def compute_feature_frequencies(shard_df: pl.LazyFrame) -> pl.DataFrame: """Generates a DataFrame containing the frequencies of codes and numerical values under different aggregations by computing frequency counts for certain attributes and organizing the results into specific categories based on the dataset's features. diff --git a/src/MEDS_tabular_automl/file_name.py b/src/MEDS_tabular_automl/file_name.py index 4ca144a..36b2861 100644 --- a/src/MEDS_tabular_automl/file_name.py +++ b/src/MEDS_tabular_automl/file_name.py @@ -73,7 +73,7 @@ def get_model_files(cfg: DictConfig, split: str, shard: str) -> list[Path]: Examples: >>> cfg = DictConfig({ - ... "input_dir": "data", + ... "path": DictConfig({"input_tabularized_cache_dir" : "data"}), ... "tabularization": { ... "window_sizes": ["1d", "7d"], ... 
"aggs": ["code/count", "value/sum", "static/present"], @@ -94,7 +94,7 @@ def get_model_files(cfg: DictConfig, split: str, shard: str) -> list[Path]: """ window_sizes = cfg.tabularization.window_sizes aggs = cfg.tabularization.aggs - shard_dir = Path(cfg.input_dir) / split / shard + shard_dir = Path(cfg.path.input_tabularized_cache_dir) / split / shard # Given a shard number, returns the model files model_files = [] for window_size in window_sizes: diff --git a/src/MEDS_tabular_automl/scripts/cache_task.py b/src/MEDS_tabular_automl/scripts/cache_task.py index 0903aa3..a437119 100644 --- a/src/MEDS_tabular_automl/scripts/cache_task.py +++ b/src/MEDS_tabular_automl/scripts/cache_task.py @@ -25,6 +25,7 @@ hydra_loguru_init, load_matrix, load_tqdm, + stage_init, write_df, ) @@ -79,17 +80,36 @@ def main(cfg: DictConfig): Args: cfg: The configuration for processing, loaded from a YAML file. """ + stage_init( + cfg, + [ + "input_dir", + "input_label_dir", + "input_tabularized_dir", + "output_dir", + "tabularization.filtered_code_metadata_fp", + ], + ) iter_wrapper = load_tqdm(cfg.tqdm) if not cfg.loguru_init: hydra_loguru_init() # Produce ts representation # shuffle tasks - tabularization_tasks = list_subdir_files(cfg.input_dir, "npz") + tabularization_tasks = list_subdir_files(cfg.input_tabularized_dir, "npz") + if len(tabularization_tasks) == 0: + raise FileNotFoundError( + f"No tabularized data found, `input_tabularized_dir`: {cfg.input_tabularized_dir}, " + "is likely incorrect" + ) np.random.shuffle(tabularization_tasks) label_dir = Path(cfg.input_label_dir) + if not label_dir.exists(): + raise FileNotFoundError( + f"Label directory {label_dir} does not exist, please check the `input_label_dir` kwarg" + ) label_df = ( pl.scan_parquet(label_dir / "**/*.parquet") .rename( @@ -108,9 +128,11 @@ def main(cfg: DictConfig): for data_fp in iter_wrapper(tabularization_tasks): # parse as time series agg split, shard_num, window_size, code_type, agg_name = Path(data_fp).with_suffix("").parts[-5:] - meds_data_in_fp = Path(cfg.output_cohort_dir) / "data" / split / f"{shard_num}.parquet" - shard_label_fp = Path(cfg.output_label_dir) / split / f"{shard_num}.parquet" - out_fp = (Path(cfg.output_dir) / get_shard_prefix(cfg.input_dir, data_fp)).with_suffix(".npz") + meds_data_in_fp = Path(cfg.input_dir) / split / f"{shard_num}.parquet" + shard_label_fp = Path(cfg.output_label_cache_dir) / split / f"{shard_num}.parquet" + out_fp = ( + Path(cfg.output_tabularized_cache_dir) / get_shard_prefix(cfg.input_tabularized_dir, data_fp) + ).with_suffix(".npz") def read_meds_data_df(meds_data_fp): if "numeric_value" not in pl.scan_parquet(meds_data_fp).columns: diff --git a/src/MEDS_tabular_automl/scripts/describe_codes.py b/src/MEDS_tabular_automl/scripts/describe_codes.py index c29b542..4391ccf 100644 --- a/src/MEDS_tabular_automl/scripts/describe_codes.py +++ b/src/MEDS_tabular_automl/scripts/describe_codes.py @@ -17,7 +17,7 @@ ) from ..file_name import list_subdir_files from ..mapper import wrap as rwlock_wrap -from ..utils import get_shard_prefix, hydra_loguru_init, load_tqdm, write_df +from ..utils import get_shard_prefix, hydra_loguru_init, load_tqdm, stage_init, write_df config_yaml = files("MEDS_tabular_automl").joinpath("configs/describe_codes.yaml") if not config_yaml.is_file(): @@ -32,6 +32,7 @@ def main(cfg: DictConfig): cfg: The configuration object for the tabularization process, loaded from a Hydra YAML configuration file. 
""" + stage_init(cfg, ["input_dir"]) iter_wrapper = load_tqdm(cfg.tqdm) if not cfg.loguru_init: hydra_loguru_init() diff --git a/src/MEDS_tabular_automl/scripts/launch_autogluon.py b/src/MEDS_tabular_automl/scripts/launch_autogluon.py index db61e9f..ccbc93d 100644 --- a/src/MEDS_tabular_automl/scripts/launch_autogluon.py +++ b/src/MEDS_tabular_automl/scripts/launch_autogluon.py @@ -1,20 +1,33 @@ +import json from importlib.resources import files from pathlib import Path import hydra import pandas as pd from loguru import logger -from omegaconf import DictConfig +from omegaconf import DictConfig, OmegaConf + +try: + import autogluon.tabular as ag +except ImportError: + ag = None from MEDS_tabular_automl.dense_iterator import DenseIterator -from ..utils import hydra_loguru_init +from ..utils import hydra_loguru_init, stage_init -config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_autogluon.yaml") +config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_model.yaml") if not config_yaml.is_file(): raise FileNotFoundError("Core configuration not successfully installed!") +def check_autogluon(): + if ag is None: + raise ImportError( + "AutoGluon could not be imported. Please try installing it using: `pip install autogluon`" + ) + + @hydra.main(version_base=None, config_path=str(config_yaml.parent.resolve()), config_name=config_yaml.stem) def main(cfg: DictConfig) -> float: """Launches AutoGluon after collecting data based on the provided configuration. @@ -22,17 +35,13 @@ def main(cfg: DictConfig) -> float: Args: cfg: The configuration dictionary specifying model and training parameters. """ - - # print(OmegaConf.to_yaml(cfg)) + check_autogluon() + stage_init( + cfg, ["input_dir", "input_label_cache_dir", "output_dir", "tabularization.filtered_code_metadata_fp"] + ) if not cfg.loguru_init: hydra_loguru_init() - # check that autogluon is installed - try: - import autogluon.tabular as ag - except ImportError: - logger.error("AutoGluon is not installed. 
Please install AutoGluon.") - # collect data based on the configuration itrain = DenseIterator(cfg, "train") ituning = DenseIterator(cfg, "tuning") @@ -44,13 +53,13 @@ def main(cfg: DictConfig) -> float: held_out_data, held_out_labels = iheld_out.densify() # construct dfs for AutoGluon - train_df = pd.DataFrame(train_data.todense()) # , columns=cols) + train_df = pd.DataFrame(train_data.todense()) train_df[cfg.task_name] = train_labels tuning_df = pd.DataFrame( tuning_data.todense(), - ) # columns=cols) + ) tuning_df[cfg.task_name] = tuning_labels - held_out_df = pd.DataFrame(held_out_data.todense()) # , columns=cols) + held_out_df = pd.DataFrame(held_out_data.todense()) held_out_df[cfg.task_name] = held_out_labels train_dataset = ag.TabularDataset(train_df) @@ -58,8 +67,13 @@ def main(cfg: DictConfig) -> float: held_out_dataset = ag.TabularDataset(held_out_df) # train model with AutoGluon + log_filepath = Path(cfg.path.model_log_dir) / f"{cfg.path.config_log_stem}_log.txt" + predictor = ag.TabularPredictor( - label=cfg.task_name, log_to_file=True, log_file_path=cfg.log_filepath, path=cfg.output_filepath + label=cfg.task_name, + log_to_file=True, + log_file_path=str(log_filepath.resolve()), + path=cfg.output_model_dir, ).fit(train_data=train_dataset, tuning_data=tuning_dataset) # predict @@ -69,12 +83,17 @@ def main(cfg: DictConfig) -> float: score = predictor.evaluate(held_out_dataset) logger.info("Test score:", score) - log_fp = Path(cfg.model_log_dir) - log_fp.mkdir(parents=True, exist_ok=True) - # log hyperparameters - out_fp = log_fp / "trial_performance_results.log" - with open(out_fp, "w") as f: - f.write(f"{cfg.output_filepath}\t{cfg.tabularization}\t{cfg.model_params}\t{None}\t{score}\n") + model_performance_log_filepath = Path(cfg.path.model_log_dir) / f"{cfg.path.performance_log_stem}.json" + model_performance_log_filepath.parent.mkdir(parents=True, exist_ok=True) + # store results + performance_dict = { + "output_model_dir": cfg.path.output_model_dir, + "tabularization": OmegaConf.to_container(cfg.tabularization), + "model_launcher": OmegaConf.to_container(cfg.model_launcher), + "score": score, + } + with open(model_performance_log_filepath, "w") as f: + json.dump(performance_dict, f) if __name__ == "__main__": diff --git a/src/MEDS_tabular_automl/scripts/launch_model.py b/src/MEDS_tabular_automl/scripts/launch_model.py index 97943b4..9f8b8da 100644 --- a/src/MEDS_tabular_automl/scripts/launch_model.py +++ b/src/MEDS_tabular_automl/scripts/launch_model.py @@ -3,11 +3,11 @@ from pathlib import Path import hydra -from omegaconf import DictConfig, open_dict +from omegaconf import DictConfig from MEDS_tabular_automl.base_model import BaseModel -from ..utils import hydra_loguru_init, log_to_logfile +from ..utils import hydra_loguru_init, log_to_logfile, stage_init config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_model.yaml") if not config_yaml.is_file(): @@ -24,32 +24,29 @@ def main(cfg: DictConfig) -> float: Returns: The evaluation result as the ROC AUC score on the held-out test set. 
""" + stage_init( + cfg, ["input_dir", "input_label_cache_dir", "output_dir", "tabularization.filtered_code_metadata_fp"] + ) - # print(OmegaConf.to_yaml(cfg)) if not cfg.loguru_init: hydra_loguru_init() - model: BaseModel = hydra.utils.instantiate(cfg.model_target) - # TODO - make tabularuzation be copied in the yaml instead of here - with open_dict(cfg): - model.cfg.tabularization = hydra.utils.instantiate(cfg.tabularization) + model_launcher: BaseModel = hydra.utils.instantiate(cfg.model_launcher) - model.train() - auc = model.evaluate() - # logger.info(f"AUC: {auc}") + model_launcher.train() + auc = model_launcher.evaluate() # save model - output_fp = Path(cfg.model_saving.model_dir) - output_fp = ( - output_fp.parent - / f"{cfg.model_saving.model_file_stem}_{auc:.4f}_{time.time()}{cfg.model_saving.model_file_extension}" - ) - output_fp.parent.mkdir(parents=True, exist_ok=True) + output_model_dir = Path(cfg.output_model_dir) + path_cfg = model_launcher.cfg.path + model_filename = f"{path_cfg.model_file_stem}_{auc:.4f}_{time.time()}{path_cfg.model_file_extension}" + output_fp = output_model_dir / model_filename + output_model_dir.parent.mkdir(parents=True, exist_ok=True) # log to logfile - log_to_logfile(model, cfg, output_fp.stem) + log_to_logfile(model_launcher, cfg, output_fp.stem) - model.save_model(output_fp) + model_launcher.save_model(output_fp) return auc diff --git a/src/MEDS_tabular_automl/scripts/tabularize_static.py b/src/MEDS_tabular_automl/scripts/tabularize_static.py index 34d9c0d..2baf406 100644 --- a/src/MEDS_tabular_automl/scripts/tabularize_static.py +++ b/src/MEDS_tabular_automl/scripts/tabularize_static.py @@ -30,6 +30,7 @@ get_shard_prefix, hydra_loguru_init, load_tqdm, + stage_init, write_df, ) @@ -56,7 +57,7 @@ def main( Args: cfg: - MEDS_cohort_dir: directory of MEDS format dataset that is ingested. + input_dir: directory of MEDS format dataset that is ingested. tabularized_data_dir: output directory of tabularized data. min_code_inclusion_frequency: The base feature inclusion frequency that should be used to dictate what features can be included in the flat representation. It can either be a float, in which @@ -81,6 +82,14 @@ def main( .. _link: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.groupby_rolling.html # noqa: E501 """ + stage_init( + cfg, + [ + "input_code_metadata_fp", + "input_dir", + "tabularization.filtered_code_metadata_fp", + ], + ) iter_wrapper = load_tqdm(cfg.tqdm) if not cfg.loguru_init: hydra_loguru_init() @@ -130,7 +139,7 @@ def write_fn(data, out_fp): np.random.shuffle(tabularization_tasks) for shard_fp, agg in iter_wrapper(tabularization_tasks): out_fp = ( - Path(cfg.output_dir) / get_shard_prefix(cfg.input_dir, shard_fp) / "none" / agg + Path(cfg.output_tabularized_dir) / get_shard_prefix(cfg.input_dir, shard_fp) / "none" / agg ).with_suffix(".npz") if out_fp.exists() and not cfg.do_overwrite: raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {out_fp} exists!") diff --git a/src/MEDS_tabular_automl/scripts/tabularize_time_series.py b/src/MEDS_tabular_automl/scripts/tabularize_time_series.py index 8eca7f4..98cca20 100644 --- a/src/MEDS_tabular_automl/scripts/tabularize_time_series.py +++ b/src/MEDS_tabular_automl/scripts/tabularize_time_series.py @@ -26,6 +26,7 @@ get_shard_prefix, hydra_loguru_init, load_tqdm, + stage_init, write_df, ) @@ -64,6 +65,14 @@ def main( FileNotFoundError: If specified directories or files in the configuration are not found. 
ValueError: If required columns like 'code' or 'value' are missing in the data files. """ + stage_init( + cfg, + [ + "input_code_metadata_fp", + "input_dir", + "tabularization.filtered_code_metadata_fp", + ], + ) iter_wrapper = load_tqdm(cfg.tqdm) if not cfg.loguru_init: hydra_loguru_init() @@ -83,7 +92,7 @@ def main( # iterate through them for shard_fp, window_size, agg in iter_wrapper(tabularization_tasks): out_fp = ( - Path(cfg.output_dir) / get_shard_prefix(cfg.input_dir, shard_fp) / window_size / agg + Path(cfg.output_tabularized_dir) / get_shard_prefix(cfg.input_dir, shard_fp) / window_size / agg ).with_suffix(".npz") def read_fn(in_fp): diff --git a/src/MEDS_tabular_automl/sklearn_model.py b/src/MEDS_tabular_automl/sklearn_model.py index 56063fd..12ed980 100644 --- a/src/MEDS_tabular_automl/sklearn_model.py +++ b/src/MEDS_tabular_automl/sklearn_model.py @@ -1,4 +1,5 @@ from pathlib import Path +from pickle import dump import numpy as np import scipy.sparse as sp @@ -101,7 +102,7 @@ def __init__(self, cfg: DictConfig): """ super().__init__() self.cfg = cfg - self.keep_data_in_memory = cfg.model_params.iterator.keep_data_in_memory + self.keep_data_in_memory = cfg.data_loading_params.keep_data_in_memory self.itrain = None self.ituning = None @@ -111,7 +112,7 @@ def __init__(self, cfg: DictConfig): self.dtuning = None self.dheld_out = None - self.model = cfg.model_params.model + self.model = cfg.model # check that self.model is a valid model if not hasattr(self.model, "fit"): raise ValueError("Model does not have a fit method.") @@ -133,7 +134,7 @@ def _fit_from_partial(self): classes = self.itrain.get_classes() best_auc = 0 best_epoch = 0 - for epoch in range(self.cfg.model_params.epochs): + for epoch in range(self.cfg.training_params.epochs): # train on each all data for shard_idx in range(len(self.itrain._data_shards)): data, labels = self.itrain.get_data_shards(shard_idx) @@ -144,7 +145,7 @@ def _fit_from_partial(self): if auc > best_auc: best_auc = auc best_epoch = epoch - if epoch - best_epoch > self.cfg.model_params.early_stopping_rounds: + if epoch - best_epoch > self.cfg.training_params.early_stopping_rounds: break def _train(self): @@ -224,9 +225,9 @@ def save_model(self, output_fp: str): if not hasattr(self.model, "save_model"): logger.info(f"Model {self.model.__class__.__name__} does not have a save_model method.") logger.info("Model will be saved using pickle dump.") - from pickle import dump - - with open(output_fp.parent / "model.pkl", "wb") as f: + if not str(output_fp.resolve()).endswith(".pkl"): + raise ValueError("Model file extension must be .pkl.") + with open(output_fp, "wb") as f: dump(self.model, f, protocol=5) else: self.model.save_model(output_fp) diff --git a/src/MEDS_tabular_automl/tabular_dataset.py b/src/MEDS_tabular_automl/tabular_dataset.py index 84a6609..fa402b7 100644 --- a/src/MEDS_tabular_automl/tabular_dataset.py +++ b/src/MEDS_tabular_automl/tabular_dataset.py @@ -47,13 +47,21 @@ def __init__(self, cfg: DictConfig, split: str = "train"): split: The data split to use, which can be one of "train", "tuning", or "held_out". This determines which subset of the data is loaded and processed. 
""" - super().__init__(cache_prefix=Path(cfg.cache_dir)) + super().__init__(cache_prefix=Path(cfg.path.cache_dir)) self.cfg = cfg self.split = split # Load shards for this split self._data_shards = sorted( - [shard.stem for shard in list_subdir_files(Path(cfg.input_label_dir) / split, "parquet")] + [ + shard.stem + for shard in list_subdir_files(Path(cfg.path.input_label_cache_dir) / split, "parquet") + ] ) + if len(self._data_shards) == 0: + raise ValueError( + "No labels found in the `input_label_cache_dir` " + + str(Path(cfg.path.input_label_cache_dir).resolve()) + ) self.valid_event_ids, self.labels = None, None self.codes_set, self.code_masks, self.num_features = self._get_code_set() @@ -114,7 +122,7 @@ def _load_ids_and_labels( to lists of corresponding labels. """ label_fps = { - shard: (Path(self.cfg.input_label_dir) / self.split / shard).with_suffix(".parquet") + shard: (Path(self.cfg.path.input_label_cache_dir) / self.split / shard).with_suffix(".parquet") for shard in self._data_shards for shard in self._data_shards } @@ -126,7 +134,7 @@ def _load_ids_and_labels( if load_labels: cached_labels[shard] = label_df.select(pl.col("label")).collect().to_series() - if self.cfg.model_params.iterator.binarize_task: + if self.cfg.data_loading_params.binarize_task: cached_labels[shard] = cached_labels[shard].map_elements( lambda x: 1 if x > 0 else 0, return_dtype=pl.Int8 ) @@ -221,10 +229,10 @@ def _get_approximate_correlation_per_feature(self, X: sp.csc_matrix, y: np.ndarr def _set_imputer(self): """Sets the imputer for the data.""" if ( - hasattr(self.cfg.model_params.iterator, "imputer") - and self.cfg.model_params.iterator.imputer.imputer_target + hasattr(self.cfg.data_loading_params, "imputer") + and self.cfg.data_loading_params.imputer.imputer_target ): - imputer = self.cfg.model_params.iterator.imputer.imputer_target + imputer = self.cfg.data_loading_params.imputer.imputer_target if hasattr(imputer, "partial_fit"): for i in range(len(self._data_shards)): X, _ = self._get_shard_by_index(i) @@ -240,10 +248,10 @@ def _set_imputer(self): def _set_scaler(self): """Sets the scaler for the data.""" if ( - hasattr(self.cfg.model_params.iterator, "normalization") - and self.cfg.model_params.iterator.normalization.normalizer + hasattr(self.cfg.data_loading_params, "normalization") + and self.cfg.data_loading_params.normalization.normalizer ): - scaler = self.cfg.model_params.iterator.normalization.normalizer + scaler = self.cfg.data_loading_params.normalization.normalizer if hasattr(scaler, "partial_fit"): for i in range(len(self._data_shards)): X, _ = self._get_shard_by_index(i) diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py index 3d6f496..97a74f9 100644 --- a/src/MEDS_tabular_automl/utils.py +++ b/src/MEDS_tabular_automl/utils.py @@ -1,21 +1,15 @@ -"""The base class for core dataset processing logic. - -Attributes: - INPUT_DF_T: This defines the type of the allowable input dataframes -- e.g., databases, filepaths, - dataframes, etc. - DF_T: This defines the type of internal dataframes -- e.g. polars DataFrames. 
-""" +"""The base class for core dataset processing logic and script utilities.""" import os +import sys from pathlib import Path import hydra import numpy as np import polars as pl from loguru import logger -from omegaconf import OmegaConf +from omegaconf import DictConfig, ListConfig, OmegaConf from scipy.sparse import coo_array -DF_T = pl.LazyFrame WRITE_USE_PYARROW = True ROW_IDX_NAME = "__row_idx" @@ -51,7 +45,7 @@ def filter_to_codes( min_code_inclusion_count: int | None, min_code_inclusion_frequency: float | None, max_include_codes: int | None, -) -> list[str]: +) -> ListConfig[str]: """Filters and returns codes based on allowed list and minimum frequency. Args: @@ -69,8 +63,14 @@ def filter_to_codes( ... pl.DataFrame({"code": ["E", "D", "A"], "count": [4, 3, 2]}).write_parquet(f.name) ... filter_to_codes( f.name, ["A", "D"], 3, None, None) ['D'] + >>> with NamedTemporaryFile() as f: + ... pl.DataFrame({"code": ["E", "D", "A"], "count": [4, 3, 2]}).write_parquet(f.name) + ... filter_to_codes( f.name, ["A", "D"], 10, None, None) + Traceback (most recent call last): + ... + ValueError: Code filtering criteria ... + ... """ - feature_freqs = pl.read_parquet(code_metadata_fp) if allowed_codes is not None: @@ -78,18 +78,23 @@ def filter_to_codes( if min_code_inclusion_frequency is not None: pass - # need to consider size of the dataset vs count - - # feature_freqs = feature_freqs.filter(pl.col("frequency") >= min_code_inclusion_frequency) if min_code_inclusion_count is not None: feature_freqs = feature_freqs.filter(pl.col("count") >= min_code_inclusion_count) if max_include_codes is not None: - # feature_freqs = feature_freqs.sort("count", reverse=True).head(max_include_codes) feature_freqs = feature_freqs.sort("count", descending=True).head(max_include_codes) - return sorted(feature_freqs["code"].to_list()) + if len(feature_freqs["code"]) == 0: + raise ValueError( + f"Code filtering criteria leaves only 0 codes. Note that {feature_freqs.shape[0]} " + "codes are read in, try modifying the following kwargs:" + f"\n- tabularization.allowed_codes: {allowed_codes}" + f"\n- tabularization.min_code_inclusion_count: {min_code_inclusion_count}" + f"\n- tabularization.min_code_inclusion_frequency: {min_code_inclusion_frequency}" + f"\n- tabularization.max_include_codes: {max_include_codes}" + ) + return ListConfig(sorted(feature_freqs["code"].to_list())) OmegaConf.register_new_resolver("filter_to_codes", filter_to_codes, replace=True) @@ -405,20 +410,69 @@ def log_to_logfile(model, cfg, output_fp): cfg: The configuration dictionary. output_fp: The relative output file path. 
""" - log_fp = Path(cfg.model_logging.model_log_dir) + log_fp = Path(cfg.path.model_log_dir) # make a folder to log everything for this model out_fp = log_fp / output_fp out_fp.mkdir(parents=True, exist_ok=True) # config as a json - config_fp = out_fp / f"{cfg.model_logging.config_log_stem}.log" + config_fp = out_fp / f"{cfg.path.config_log_stem}.json" with open(config_fp, "w") as f: f.write(OmegaConf.to_yaml(cfg)) - model_performance_fp = out_fp / f"{cfg.model_logging.performance_log_stem}.log" + model_performance_fp = out_fp / f"{cfg.path.performance_log_stem}.log" with open(model_performance_fp, "w") as f: f.write("model_fp,tuning_auc,test_auc\n") f.write(f"{output_fp},{model.evaluate()},{model.evaluate(split='held_out')}\n") logger.debug(f"Model config and performance logged to {config_fp} and {model_performance_fp}") + + +def current_script_name() -> str: + """Returns the name of the module that called this function.""" + + main_module = sys.modules["__main__"] + main_func = getattr(main_module, "main", None) + if main_func and callable(main_func): + func_module = main_func.__module__ + if func_module == "__main__": + return Path(sys.argv[0]).stem + else: + return func_module.split(".")[-1] + + logger.warning("Can't find main function in __main__ module. Using sys.argv[0] as a fallback.") + return Path(sys.argv[0]).stem + + +def stage_init(cfg: DictConfig, keys: list[str]): + """Initializes the stage by logging the configuration and the stage-specific paths. + + Args: + cfg: The global configuration object, which should have a ``cfg.stage_cfg`` attribute containing the + stage specific configuration. + + Returns: The data input directory, stage output directory, and metadata input directory. + """ + logger.info( + f"Running {current_script_name()} with the following configuration:\n{OmegaConf.to_yaml(cfg)}" + ) + + chk_kwargs = {k: OmegaConf.select(cfg, k) for k in keys} + + def chk(x: Path | None) -> str: + if x is None: + return "❌" + return "✅" if x.exists() and str(x) != "" else "❌" + + paths_strs = [ + f" - {k}: {chk(Path(v) if v is not None else None)} " + f"{str(Path(v).resolve()) if v is not None else 'None'}" + for k, v in chk_kwargs.items() + ] + + logger_strs = [ + f"Stage config:\n{OmegaConf.to_yaml(cfg)}", + "Paths: (checkbox indicates if it exists)", + ] + logger.debug("\n".join(logger_strs + paths_strs)) diff --git a/src/MEDS_tabular_automl/xgboost_model.py b/src/MEDS_tabular_automl/xgboost_model.py index ec33c15..f95563f 100644 --- a/src/MEDS_tabular_automl/xgboost_model.py +++ b/src/MEDS_tabular_automl/xgboost_model.py @@ -43,7 +43,7 @@ def __init__(self, cfg: DictConfig, split: str): cfg: The configuration dictionary. split: The data split to use. 
""" - xgb.DataIter.__init__(self, cache_prefix=Path(cfg.cache_dir)) + xgb.DataIter.__init__(self, cache_prefix=Path(cfg.path.cache_dir)) TabularDataset.__init__(self, cfg=cfg, split=split) self.valid_event_ids, self.labels = self._load_ids_and_labels() # check if the labels are empty @@ -107,7 +107,7 @@ def __init__(self, cfg: DictConfig): """ super().__init__() self.cfg = cfg - self.keep_data_in_memory = cfg.model_params.iterator.keep_data_in_memory + self.keep_data_in_memory = cfg.data_loading_params.keep_data_in_memory self.itrain = None self.ituning = None @@ -131,11 +131,10 @@ def _build(self): def _train(self): """Trains the model.""" self.model = xgb.train( - OmegaConf.to_container(self.cfg.model_params.model), + OmegaConf.to_container(self.cfg.model), self.dtrain, - num_boost_round=self.cfg.model_params.num_boost_round, - early_stopping_rounds=self.cfg.model_params.early_stopping_rounds, - # nthreads=self.cfg.nthreads, + num_boost_round=self.cfg.training_params.num_boost_round, + early_stopping_rounds=self.cfg.training_params.early_stopping_rounds, evals=[(self.dtrain, "train"), (self.dtuning, "tuning")], verbose_eval=0, ) diff --git a/tests/test_configs.py b/tests/test_configs.py index 708d270..d0bbc45 100644 --- a/tests/test_configs.py +++ b/tests/test_configs.py @@ -5,17 +5,14 @@ import subprocess import hydra +import polars as pl import pytest from hydra import compose, initialize -from hydra.core.hydra_config import HydraConfig -from loguru import logger +from omegaconf import DictConfig, OmegaConf from MEDS_tabular_automl.sklearn_model import SklearnModel from MEDS_tabular_automl.xgboost_model import XGBoostModel -logger.disable("MEDS_tabular_automl") -from omegaconf import OmegaConf - def run_command(script: str, args: list[str], hydra_kwargs: dict[str, str], test_name: str): command_parts = [script] + args + [f"{k}={v}" for k, v in hydra_kwargs.items()] @@ -28,46 +25,91 @@ def run_command(script: str, args: list[str], hydra_kwargs: dict[str, str], test def make_config_mutable(cfg): - OmegaConf.set_readonly(cfg, False) - for key in cfg: - if isinstance(cfg[key], OmegaConf): - make_config_mutable(cfg[key]) + if OmegaConf.is_config(cfg): + OmegaConf.set_readonly(cfg, False) + for key in cfg.keys(): + print(key) + # try: + cfg[key] = make_config_mutable(cfg[key]) + # except: + # import pdb; pdb.set_trace() + return cfg + # elif isinstance(cfg, list): + # return [make_config_mutable(item) for item in cfg] + # elif isinstance(cfg, dict): + # return {key: make_config_mutable(value) for key, value in cfg.items()} + else: + return cfg @pytest.mark.parametrize( - "model", - ["xgboost", "sgd_classifier", "knn_classifier", "logistic_regression", "random_forest_classifier"], + "model_launcher_override", + [ + "xgboost", + "sgd_classifier", + "knn_classifier", + "logistic_regression", + "random_forest_classifier", + "autogluon", + ], ) @pytest.mark.parametrize("imputer", ["default", "mean_imputer", "mode_imputer", "median_imputer"]) @pytest.mark.parametrize("normalization", ["standard_scaler", "max_abs_scaler"]) -def test_model_config(model, imputer, normalization): - MEDS_cohort_dir = "blah" +def test_model_config(model_launcher_override, imputer, normalization, tmp_path): + input_dir = "/foo/" + code_metadata_fp = f"/{str(tmp_path)}/codes.parquet" + model_launcher_config_kwargs = { + "input_dir": input_dir, + "output_dir": "/bar/", + "output_model_dir": "/baz/", + "++tabularization.filtered_code_metadata_fp": code_metadata_fp, + "++tabularization.min_code_inclusion_count": "0", + 
"task_name": "foo_bar", + } + pl.DataFrame({"code": ["E", "D", "A"], "count": [4, 3, 2]}).write_parquet(code_metadata_fp) + + with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"): + overrides = [ + f"model_launcher={model_launcher_override}", + f"data_processing_params.imputer={imputer}", + f"data_processing_params.normalization={normalization}", + ] + [f"{k}={v}" for k, v in model_launcher_config_kwargs.items()] + cfg = compose(config_name="launch_model", overrides=overrides, return_hydra_config=True) + + model_launcher = hydra.utils.instantiate(cfg.model_launcher) + match model_launcher_override: + case "xgboost": + assert isinstance( + model_launcher, XGBoostModel + ), "model_launcher should be an instance of XGBoostModel" + case "autogluon": + assert isinstance( + model_launcher, DictConfig + ), "model_launcher should not be a DictConfig for autogluon" + case _: + assert isinstance( + model_launcher, SklearnModel + ), "model_launcher should be an instance of SklearnModel" + assert cfg.tabularization.window_sizes + + +def test_generate_subsets_configs(): + input_dir = "blah" + stderr, stdout_ws = run_command("generate-subsets", ["[30d]"], {}, "generate-subsets window_sizes") + stderr, stdout_agg = run_command("generate-subsets", ["[static/present]"], {}, "generate-subsets aggs") xgboost_config_kwargs = { - "MEDS_cohort_dir": MEDS_cohort_dir, - "output_cohort_dir": "blah", + "input_dir": input_dir, + "output_dir": "blah", "do_overwrite": False, "seed": 1, "hydra.verbose": True, "tqdm": False, "loguru_init": True, "tabularization.min_code_inclusion_count": 1, - "tabularization.window_sizes": "[30d,365d,full]", - "tabularization._resolved_codes": "[test,test2]", + "tabularization.window_sizes": f"{stdout_ws.strip()}", } - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml - overrides = [f"model={model}", f"imputer={imputer}", f"normalization={normalization}"] + [ - f"{k}={v}" for k, v in xgboost_config_kwargs.items() - ] - cfg = compose( - config_name="launch_model", overrides=overrides, return_hydra_config=True - ) # config.yaml - - HydraConfig().set_config(cfg) - # make_config_mutable(cfg) - expected_model_class = XGBoostModel if model == "xgboost" else SklearnModel - model = hydra.utils.instantiate(cfg.model_target) - assert isinstance(model, expected_model_class) - # assert cfg.tabularization.window_sizes + with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"): + overrides = [f"{k}={v}" for k, v in xgboost_config_kwargs.items()] + cfg = compose(config_name="launch_model", overrides=overrides) + assert cfg.tabularization.window_sizes diff --git a/tests/test_integration.py b/tests/test_integration.py index 055070c..d231be1 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -13,6 +13,7 @@ CODE_COLS, EXPECTED_STATIC_FILES, MEDS_OUTPUTS, + NUM_SHARDS, SPLITS_JSON, STATIC_FIRST_COLS, STATIC_PRESENT_COLS, @@ -43,12 +44,14 @@ def run_command(script: str, args: list[str], hydra_kwargs: dict[str, str], test def test_integration(tmp_path): # Step 0: Setup Environment - MEDS_cohort_dir = Path(tmp_path) / "MEDS_cohort_dir" - output_cohort_dir = Path(tmp_path) / "output_cohort_dir" + input_dir = Path(tmp_path) / "input_dir" + output_dir = Path(tmp_path) / "output_dir" + input_label_dir = Path(tmp_path) / "label_dir" + output_model_dir = Path(tmp_path) / "output_model_dir" shared_config = { - "MEDS_cohort_dir": str(MEDS_cohort_dir.resolve()), - 
"output_cohort_dir": str(output_cohort_dir.resolve()), + "input_dir": str(input_dir.resolve()), + "output_dir": str(output_dir.resolve()), "do_overwrite": False, "seed": 1, "hydra.verbose": True, @@ -58,35 +61,34 @@ def test_integration(tmp_path): describe_codes_config = {**shared_config} - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml + with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"): overrides = [f"{k}={v}" for k, v in describe_codes_config.items()] - cfg = compose(config_name="describe_codes", overrides=overrides) # config.yaml + cfg = compose(config_name="describe_codes", overrides=overrides) # Create the directories - (output_cohort_dir / "data").mkdir(parents=True, exist_ok=True) + (output_dir).mkdir(parents=True, exist_ok=True) # Store MEDS outputs all_data = [] for split, data in MEDS_OUTPUTS.items(): - file_path = output_cohort_dir / "data" / f"{split}.parquet" - file_path.parent.mkdir(exist_ok=True) + file_path = input_dir / f"{split}.parquet" + file_path.parent.mkdir(exist_ok=True, parents=True) df = pl.read_csv(StringIO(data)).with_columns(pl.col("time").str.to_datetime("%Y-%m-%dT%H:%M:%S%.f")) df.write_parquet(file_path) all_data.append(df) + assert file_path.exists() all_data = pl.concat(all_data, how="diagonal_relaxed").sort(by=["subject_id", "time"]) # Check the files are not empty meds_files = list_subdir_files(Path(cfg.input_dir), "parquet") assert ( - len(list_subdir_files(Path(cfg.input_dir).parent, "parquet")) == 4 + len(list_subdir_files(Path(cfg.input_dir), "parquet")) == 4 ), "MEDS train split Data Files Should be 4!" for f in meds_files: assert pl.read_parquet(f).shape[0] > 0, "MEDS Data Tabular Dataframe Should not be Empty!" 
split_json = json.load(StringIO(SPLITS_JSON)) - splits_fp = output_cohort_dir / ".shards.json" + splits_fp = input_dir / ".shards.json" json.dump(split_json, splits_fp.open("w")) # Step 1: Run the describe_codes script @@ -96,6 +98,7 @@ def test_integration(tmp_path): describe_codes_config, "describe_codes", ) + assert Path(cfg.output_filepath).is_file() feature_columns = get_feature_columns(cfg.output_filepath) @@ -106,7 +109,7 @@ def test_integration(tmp_path): assert get_feature_names(value_agg, feature_columns) == sorted(VALUE_COLS) # Step 2: Run the static data tabularization script - tabularize_config = { + tabularize_static_config = { **shared_config, "tabularization.min_code_inclusion_count": 1, "tabularization.window_sizes": "[30d,365d,full]", @@ -114,17 +117,17 @@ def test_integration(tmp_path): stderr, stdout = run_command( "meds-tab-tabularize-static", [], - tabularize_config, + tabularize_static_config, "tabularization", ) - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml - overrides = [f"{k}={v}" for k, v in tabularize_config.items()] - cfg = compose(config_name="tabularization", overrides=overrides) # config.yaml - - output_files = list(Path(cfg.output_dir).glob("**/static/**/*.npz")) - actual_files = [get_shard_prefix(Path(cfg.output_dir), each) + ".npz" for each in output_files] + with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"): + overrides = [f"{k}={v}" for k, v in tabularize_static_config.items()] + cfg = compose(config_name="tabularization", overrides=overrides) + + output_dir = Path(cfg.output_dir) / "tabularize" + + output_files = list(output_dir.glob("**/static/**/*.npz")) + actual_files = [get_shard_prefix(output_dir, each) + ".npz" for each in output_files] assert set(actual_files) == set(EXPECTED_STATIC_FILES) # Check the files are not empty for f in output_files: @@ -168,11 +171,9 @@ def test_integration(tmp_path): ) # confirm summary files exist: - output_files = list_subdir_files(cfg.output_dir, "npz") + output_files = list_subdir_files(str(output_dir.resolve()), "npz") actual_files = [ - get_shard_prefix(Path(cfg.output_dir), each) + ".npz" - for each in output_files - if "none/static" not in str(each) + get_shard_prefix(output_dir, each) + ".npz" for each in output_files if "none/static" not in str(each) ] assert len(actual_files) > 0 for f in output_files: @@ -194,18 +195,36 @@ def test_integration(tmp_path): assert ts_matrix.shape[0] == expected_num_rows, ( f"Time-Series Data matrix Should have {expected_num_rows}" f" rows but has {ts_matrix.shape[0]}!" 
) + output_files = list_subdir_files(str(Path(cfg.output_tabularized_dir).resolve()), "npz") + for split in split_json.keys(): + for window in cfg.tabularization.window_sizes: + for agg in cfg.tabularization.aggs: + if agg.startswith("static"): + if window != cfg.tabularization.window_sizes[0]: + continue + expected_fp = Path(cfg.output_tabularized_dir) / split / "none" / f"{agg}.npz" + else: + expected_fp = Path(cfg.output_tabularized_dir) / split / window / f"{agg}.npz" + assert expected_fp in output_files, f"Missing {expected_fp}" + expected_num_time_tabs = ( + NUM_SHARDS * len(cfg.tabularization.window_sizes) * (len(cfg.tabularization.aggs) - 2) + ) + expected_num_static_tabs = NUM_SHARDS * 2 + assert len(list_subdir_files(cfg.output_dir, "npz")) == expected_num_time_tabs + expected_num_static_tabs + # Step 4: Run the task_specific_caching script cache_config = { **shared_config, "tabularization.min_code_inclusion_count": 1, "tabularization.window_sizes": "[30d,365d,full]", + "task_name": "test_task", + "input_label_dir": str(input_label_dir.resolve()), } - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml + with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"): overrides = [f"{k}={v}" for k, v in cache_config.items()] - cfg = compose(config_name="task_specific_caching", overrides=overrides) # config.yaml + cfg = compose(config_name="task_specific_caching", overrides=overrides) + # Create fake labels df = get_unique_time_events_df(get_events_df(all_data.lazy(), feature_columns)).collect() pseudo_labels = pl.Series(([0, 1] * df.shape[0])[: df.shape[0]]) df = df.with_columns(pl.Series(name="boolean_value", values=pseudo_labels)) @@ -229,17 +248,54 @@ def test_integration(tmp_path): cache_config, "task_specific_caching", ) + for split in split_json.keys(): + for window in cfg.tabularization.window_sizes: + for agg in cfg.tabularization.aggs: + if agg.startswith("static"): + if window != cfg.tabularization.window_sizes[0]: + continue + expected_fp = Path(cfg.output_tabularized_cache_dir) / split / "none" / f"{agg}.npz" + else: + expected_fp = Path(cfg.output_tabularized_cache_dir) / split / window / f"{agg}.npz" + output_files = list_subdir_files(str(Path(cfg.output_tabularized_cache_dir).resolve()), "npz") + assert expected_fp in output_files, f"Missing {expected_fp}" + [each for each in output_files if "0/30d" in str(each) and "code/count" in str(each)] + assert ( + len(list_subdir_files(cfg.output_tabularized_cache_dir, "npz")) + == expected_num_time_tabs + expected_num_static_tabs + ) + stderr, stdout = run_command( - "meds-tab-model", + "meds-tab-cache-task", [ "--multirun", - f"tabularization.window_sizes={stdout_ws.strip()}", f"tabularization.aggs={stdout_agg.strip()}", - "hydra.sweeper.n_jobs=5", - "hydra.sweeper.n_trials=10", ], cache_config, - "xgboost-model", + "task_specific_caching", ) - assert "The best model can be found at" in stderr - assert "Performance of best model:" in stderr + + for model in [ + "xgboost", + "knn_classifier", + "logistic_regression", + "random_forest_classifier", + "sgd_classifier", + ]: + model_config = { + **shared_config, + "tabularization.min_code_inclusion_count": 1, + "tabularization.window_sizes": "[30d,365d,full]", + "task_name": "test_task", + "output_model_dir": str(output_model_dir.resolve()), + "model_launcher": model, + "hydra.sweeper.n_trials": 1, + } + overrides = [f"tabularization.aggs={stdout_agg.strip()}"] + if model == "autogluon": + 
script = "meds-tab-autogluon" + else: + script = "meds-tab-model" + overrides = ["--multirun"] + overrides + + stderr, stdout = run_command(script, overrides, model_config, f"launch_model_{model}") diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index d110121..006252d 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -2,16 +2,14 @@ root = rootutils.setup_root(__file__, dotenv=True, pythonpath=True, cwd=True) -import importlib.util +import importlib import json -import os -import subprocess +import shutil from io import StringIO from pathlib import Path import polars as pl from hydra import compose, initialize -from hydra.core.hydra_config import HydraConfig from loguru import logger from MEDS_tabular_automl.describe_codes import get_feature_columns @@ -150,12 +148,14 @@ def test_tabularize(tmp_path): - MEDS_cohort_dir = Path(tmp_path) / "MEDS_cohort_dir" - output_cohort_dir = Path(tmp_path) / "output_cohort_dir" + input_dir = Path(tmp_path) / "input_dir" + output_dir = Path(tmp_path) / "output_dir" + input_label_dir = Path(tmp_path) / "label_dir" + output_model_dir = Path(tmp_path) / "output_model_dir" shared_config = { - "MEDS_cohort_dir": str(MEDS_cohort_dir.resolve()), - "output_cohort_dir": str(output_cohort_dir.resolve()), + "input_dir": str(input_dir.resolve()), + "output_dir": str(output_dir.resolve()), "do_overwrite": False, "seed": 1, "hydra.verbose": True, @@ -165,35 +165,34 @@ def test_tabularize(tmp_path): describe_codes_config = {**shared_config} - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml + with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"): overrides = [f"{k}={v}" for k, v in describe_codes_config.items()] - cfg = compose(config_name="describe_codes", overrides=overrides) # config.yaml + cfg = compose(config_name="describe_codes", overrides=overrides) # Create the directories - (output_cohort_dir / "data").mkdir(parents=True, exist_ok=True) + (output_dir).mkdir(parents=True, exist_ok=True) # Store MEDS outputs all_data = [] for split, data in MEDS_OUTPUTS.items(): - file_path = output_cohort_dir / "data" / f"{split}.parquet" - file_path.parent.mkdir(exist_ok=True) + file_path = input_dir / f"{split}.parquet" + file_path.parent.mkdir(exist_ok=True, parents=True) df = pl.read_csv(StringIO(data)).with_columns(pl.col("time").str.to_datetime("%Y-%m-%dT%H:%M:%S%.f")) df.write_parquet(file_path) all_data.append(df) + assert file_path.exists() all_data = pl.concat(all_data, how="diagonal_relaxed").sort(by=["subject_id", "time"]) # Check the files are not empty meds_files = list_subdir_files(Path(cfg.input_dir), "parquet") assert ( - len(list_subdir_files(Path(cfg.input_dir).parent, "parquet")) == 4 + len(list_subdir_files(Path(cfg.input_dir), "parquet")) == 4 ), "MEDS train split Data Files Should be 4!" for f in meds_files: assert pl.read_parquet(f).shape[0] > 0, "MEDS Data Tabular Dataframe Should not be Empty!" 
split_json = json.load(StringIO(SPLITS_JSON)) - splits_fp = output_cohort_dir / ".shards.json" + splits_fp = input_dir / ".shards.json" json.dump(split_json, splits_fp.open("w")) # Step 1: Describe Codes - compute code frequencies describe_codes.main(cfg) @@ -214,14 +213,12 @@ def test_tabularize(tmp_path): "tabularization.window_sizes": "[30d,365d,full]", } - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml + with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"): overrides = [f"{k}={v}" for k, v in tabularize_static_config.items()] - cfg = compose(config_name="tabularization", overrides=overrides) # config.yaml + cfg = compose(config_name="tabularization", overrides=overrides) tabularize_static.main(cfg) - output_dir = Path(cfg.output_cohort_dir) / "tabularize" + output_dir = Path(cfg.output_dir) / "tabularize" output_files = list(output_dir.glob("**/static/**/*.npz")) actual_files = [get_shard_prefix(output_dir, each) + ".npz" for each in output_files] @@ -280,35 +277,34 @@ def test_tabularize(tmp_path): assert ts_matrix.shape[0] == expected_num_rows, ( f"Time-Series Data matrix Should have {expected_num_rows}" f" rows but has {ts_matrix.shape[0]}!" ) - output_files = list_subdir_files(str(output_dir.resolve()), "npz") + output_files = list_subdir_files(str(Path(cfg.output_tabularized_dir).resolve()), "npz") for split in split_json.keys(): for window in cfg.tabularization.window_sizes: for agg in cfg.tabularization.aggs: if agg.startswith("static"): if window != cfg.tabularization.window_sizes[0]: continue - expected_fp = Path(cfg.output_dir) / split / "none" / f"{agg}.npz" + expected_fp = Path(cfg.output_tabularized_dir) / split / "none" / f"{agg}.npz" else: - expected_fp = Path(cfg.output_dir) / split / window / f"{agg}.npz" + expected_fp = Path(cfg.output_tabularized_dir) / split / window / f"{agg}.npz" assert expected_fp in output_files, f"Missing {expected_fp}" expected_num_time_tabs = ( NUM_SHARDS * len(cfg.tabularization.window_sizes) * (len(cfg.tabularization.aggs) - 2) ) expected_num_static_tabs = NUM_SHARDS * 2 assert len(list_subdir_files(cfg.output_dir, "npz")) == expected_num_time_tabs + expected_num_static_tabs - cfg.output_dir # Step 3: Cache Task data cache_config = { **shared_config, "tabularization.min_code_inclusion_count": 1, "tabularization.window_sizes": "[30d,365d,full]", + "task_name": "test_task", + "input_label_dir": str(input_label_dir.resolve()), } - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml + with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"): overrides = [f"{k}={v}" for k, v in cache_config.items()] - cfg = compose(config_name="task_specific_caching", overrides=overrides) # config.yaml + cfg = compose(config_name="task_specific_caching", overrides=overrides) # Create fake labels df = get_unique_time_events_df(get_events_df(all_data.lazy(), feature_columns)).collect() @@ -327,133 +323,102 @@ def test_tabularize(tmp_path): if agg.startswith("static"): if window != cfg.tabularization.window_sizes[0]: continue - expected_fp = Path(cfg.output_dir) / split / "none" / f"{agg}.npz" + expected_fp = Path(cfg.output_tabularized_cache_dir) / split / "none" / f"{agg}.npz" else: - expected_fp = Path(cfg.output_dir) / split / window / f"{agg}.npz" - output_files = list_subdir_files(str(Path(cfg.output_dir).resolve()), "npz") + expected_fp = 
Path(cfg.output_tabularized_cache_dir) / split / window / f"{agg}.npz" + output_files = list_subdir_files(str(Path(cfg.output_tabularized_cache_dir).resolve()), "npz") assert expected_fp in output_files, f"Missing {expected_fp}" [each for each in output_files if "0/30d" in str(each) and "code/count" in str(each)] - assert len(list_subdir_files(cfg.output_dir, "npz")) == expected_num_time_tabs + expected_num_static_tabs + assert ( + len(list_subdir_files(cfg.output_tabularized_cache_dir, "npz")) + == expected_num_time_tabs + expected_num_static_tabs + ) - xgboost_config_kwargs = { + xgboost_config = { **shared_config, "tabularization.min_code_inclusion_count": 1, "tabularization.window_sizes": "[30d,365d,full]", + "task_name": "test_task", + "output_model_dir": str(output_model_dir.resolve()), } - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml - overrides = ["model=xgboost"] + [f"{k}={v}" for k, v in xgboost_config_kwargs.items()] - cfg = compose( - config_name="launch_model", overrides=overrides, return_hydra_config=True - ) # config.yaml - - output_dir = Path(cfg.output_cohort_dir) / "model" + with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"): + overrides = ["model_launcher=xgboost"] + [f"{k}={v}" for k, v in xgboost_config.items()] + cfg = compose(config_name="launch_model", overrides=overrides, return_hydra_config=True) - HydraConfig().set_config(cfg) launch_model.main(cfg) - output_files = list(output_dir.glob("**/*.json")) - assert len(output_files) == 1 - log_dir = Path(cfg.model_logging.model_log_dir) - log_csv = list(log_dir.glob("**/*.log")) - assert len(log_csv) == 2 + expected_output_dir = Path(cfg.output_model_dir) + output_files = list(expected_output_dir.glob("**/*.json")) + assert len(output_files) == 2 + + log_dir = Path(cfg.path.model_log_dir) + log_files = list(log_dir.glob("**/*.log")) + assert len(log_files) == 1 + shutil.rmtree(expected_output_dir) - sklearnmodel_config_kwargs = { + sklearnmodel_config = { **shared_config, "tabularization.min_code_inclusion_count": 1, "tabularization.window_sizes": "[30d,365d,full]", + "task_name": "test_task", + "output_model_dir": str(output_model_dir.resolve()), } - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml - overrides = ["model=sgd_classifier"] + [f"{k}={v}" for k, v in sklearnmodel_config_kwargs.items()] - cfg = compose(config_name="launch_model", overrides=overrides) # config.yaml - - output_dir = Path(cfg.output_cohort_dir) / "model" + with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"): + overrides = ["model_launcher=sgd_classifier"] + [f"{k}={v}" for k, v in sklearnmodel_config.items()] + cfg = compose(config_name="launch_model", overrides=overrides) launch_model.main(cfg) - output_files = list(output_dir.glob("**/*.pkl")) + + expected_output_dir = Path(cfg.output_model_dir) + output_files = list(expected_output_dir.glob("**/*.pkl")) assert len(output_files) == 1 + shutil.rmtree(expected_output_dir) - sklearnmodel_config_kwargs = { + sklearnmodel_config = { **shared_config, "tabularization.min_code_inclusion_count": 1, "tabularization.window_sizes": "[30d,365d,full]", - "model_params.iterator.keep_data_in_memory": False, - "model_saving.model_dir": "${output_cohort_dir}/model_online/model_${now:%Y-%m-%d_%H-%M-%S}", + "data_loading_params.keep_data_in_memory": False, + "task_name": "test_task", + "output_model_dir": 
str(output_model_dir.resolve()), } - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml - overrides = ["model=sgd_classifier"] + [f"{k}={v}" for k, v in sklearnmodel_config_kwargs.items()] - cfg = compose(config_name="launch_model", overrides=overrides) # config.yaml + with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"): + overrides = ["model_launcher=sgd_classifier"] + [f"{k}={v}" for k, v in sklearnmodel_config.items()] + cfg = compose(config_name="launch_model", overrides=overrides) + assert not cfg.data_loading_params.keep_data_in_memory + assert cfg.model_launcher.data_loading_params.binarize_task - output_dir = Path(cfg.output_cohort_dir) / "model_online" + output_dir = Path(cfg.output_dir) / "model_online" launch_model.main(cfg) - output_files = list(output_dir.glob("**/*.pkl")) + + expected_output_dir = Path(cfg.output_model_dir) + output_files = list(expected_output_dir.glob("**/*.pkl")) assert len(output_files) == 1 + shutil.rmtree(expected_output_dir) if importlib.util.find_spec("autogluon") is not None: import autogluon as ag from MEDS_tabular_automl.scripts import launch_autogluon - autogluon_config_kwargs = { + autogluon_config = { **shared_config, "tabularization.min_code_inclusion_count": 1, "tabularization.window_sizes": "[30d,365d,full]", - "model_params.iterator.keep_data_in_memory": False, - "model_saving.model_dir": "${output_cohort_dir}/model_online/model_${now:%Y-%m-%d_%H-%M-%S}", + "task_name": "test_task", + "output_model_dir": str(output_model_dir.resolve()), } - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml - overrides = [f"{k}={v}" for k, v in autogluon_config_kwargs.items()] - cfg = compose(config_name="launch_autogluon", overrides=overrides) # config.yaml - - output_dir = Path(cfg.output_cohort_dir) / "model_online" + with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"): + overrides = ["model_launcher=autogluon"] + [f"{k}={v}" for k, v in autogluon_config.items()] + cfg = compose(config_name="launch_model", overrides=overrides) launch_autogluon.main(cfg) - output_files = list(output_dir.glob("*")) - most_recent_file = max(output_files, key=os.path.getmtime) - ag.tabular.TabularPredictor.load(most_recent_file) - - -def run_command(script: str, args: list[str], hydra_kwargs: dict[str, str], test_name: str): - command_parts = [script] + args + [f"{k}={v}" for k, v in hydra_kwargs.items()] - command_out = subprocess.run(" ".join(command_parts), shell=True, capture_output=True) - stderr = command_out.stderr.decode() - stdout = command_out.stdout.decode() - if command_out.returncode != 0: - raise AssertionError(f"{test_name} failed!\nstdout:\n{stdout}\nstderr:\n{stderr}") - return stderr, stdout - - -def test_xgboost_config(): - MEDS_cohort_dir = "blah" - stderr, stdout_ws = run_command("generate-subsets", ["[30d]"], {}, "generate-subsets window_sizes") - stderr, stdout_agg = run_command("generate-subsets", ["[static/present]"], {}, "generate-subsets aggs") - xgboost_config_kwargs = { - "MEDS_cohort_dir": MEDS_cohort_dir, - "output_cohort_dir": "blah", - "do_overwrite": False, - "seed": 1, - "hydra.verbose": True, - "tqdm": False, - "loguru_init": True, - "tabularization.min_code_inclusion_count": 1, - "tabularization.window_sizes": f"{stdout_ws.strip()}", - } - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to 
config.yaml - overrides = [f"{k}={v}" for k, v in xgboost_config_kwargs.items()] - cfg = compose(config_name="launch_model", overrides=overrides) # config.yaml - assert cfg.tabularization.window_sizes + expected_output_filepath = Path(cfg.output_model_dir) / "predictor.pkl" + assert expected_output_filepath.is_file() + ag.tabular.TabularPredictor.load(cfg.output_model_dir)
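
The updated tests above exercise the renamed config groups end to end. As a quick reference, the following is a minimal sketch (not part of the diff) of driving the reworked `launch_model` config programmatically with the new keys, mirroring `tests/test_tabularize.py`. The paths and task name are placeholders, and the relative `config_path` assumes the caller sits in the `tests/` directory as the updated tests do.

```python
# Minimal sketch with placeholder paths: compose the reworked `launch_model` config and run it.
from hydra import compose, initialize

from MEDS_tabular_automl.scripts import launch_model

with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"):
    cfg = compose(
        config_name="launch_model",
        overrides=[
            "model_launcher=xgboost",            # previously `model=xgboost`
            "input_dir=/path/to/meds/data",      # previously `MEDS_cohort_dir`
            "output_dir=/path/to/tabularized",   # previously `output_cohort_dir`
            "output_model_dir=/path/to/models",  # new: models no longer land under the output cohort
            "task_name=my_task",
            "tabularization.min_code_inclusion_count=1",
            "tabularization.window_sizes=[30d,365d,full]",
        ],
        return_hydra_config=True,
    )

# Model, training, and iterator settings now live under `model_launcher`,
# `training_params`, and `data_loading_params` instead of `model_params.*`.
launch_model.main(cfg)
```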