-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #5 from mmcdermott/clean
Clean
- Loading branch information
Showing
30 changed files
with
3,583 additions
and
55 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,23 +19,21 @@ jobs: | |
- name: Checkout | ||
uses: actions/checkout@v3 | ||
|
||
- name: Set up Python 3.11 | ||
- name: Set up Python 3.12 | ||
uses: actions/setup-python@v3 | ||
with: | ||
python-version: "3.11" | ||
python-version: "3.12" | ||
|
||
- name: Install packages | ||
run: | | ||
pip install -e . | ||
pip install pytest | ||
pip install pytest-cov[toml] | ||
pip install -e .[tests] | ||
#---------------------------------------------- | ||
# run test suite | ||
#---------------------------------------------- | ||
- name: Run tests | ||
run: | | ||
pytest -v --doctest-modules --cov | ||
pytest -v --doctest-modules --cov --ignore=hf_cohort/ | ||
- name: Upload coverage to Codecov | ||
uses: codecov/[email protected] | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,27 +1,38 @@ | ||
[build-system] | ||
requires = ["setuptools>=61.0"] | ||
build-backend = "setuptools.build_meta" | ||
|
||
[project] | ||
name = "MEDS_tabularization" | ||
version = "0.0.1" | ||
authors = [ | ||
{ name="Matthew McDermott", email="[email protected]" }, | ||
{ name="Nassim Oufattole", email="[email protected]" }, | ||
{ name="Teya Bergamaschi", email="[email protected]" }, | ||
] | ||
description = "TODO" | ||
description = "Scalable Tabularization of MEDS format Time-Series data" | ||
readme = "README.md" | ||
requires-python = ">=3.12" | ||
classifiers = [ | ||
"Programming Language :: Python :: 3", | ||
"License :: OSI Approved :: MIT License", | ||
"Operating System :: OS Independent", | ||
] | ||
dependencies = ["polars", "pyarrow", "loguru", "hydra-core", "numpy"] | ||
dependencies = ["polars", "pyarrow", "loguru", "hydra-core", "numpy", "scipy", "pandas", "tqdm", "xgboost", "scikit-learn", "hydra-optuna-sweeper", "hydra-joblib-launcher", "ml-mixins"] | ||
|
||
[project.scripts] | ||
meds-tab-describe = "MEDS_tabular_automl.scripts.describe_codes:main" | ||
meds-tab-tabularize-static = "MEDS_tabular_automl.scripts.tabularize_static:main" | ||
meds-tab-tabularize-time-series = "MEDS_tabular_automl.scripts.tabularize_time_series:main" | ||
meds-tab-cache-task = "MEDS_tabular_automl.scripts.cache_task:main" | ||
meds-tab-xgboost = "MEDS_tabular_automl.scripts.launch_xgboost:main" | ||
meds-tab-xgboost-sweep = "MEDS_tabular_automl.scripts.sweep_xgboost:main" | ||
|
||
[project.optional-dependencies] | ||
dev = ["pre-commit"] | ||
tests = ["pytest", "pytest-cov", "rootutils"] | ||
profiling = ["mprofile", "matplotlib"] | ||
|
||
[build-system] | ||
requires = ["setuptools>=61.0", "setuptools-scm>=8.0", "wheel"] | ||
build-backend = "setuptools.build_meta" | ||
|
||
[project.urls] | ||
Homepage = "https://github.com/mmcdermott/MEDS_polars_functions" | ||
Issues = "https://github.com/mmcdermott/MEDS_polars_functions/issues" | ||
Homepage = "https://github.com/mmcdermott/MEDS_Tabular_AutoML" | ||
Issues = "https://github.com/mmcdermott/MEDS_Tabular_AutoML/issues" |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
MEDS_cohort_dir: ??? | ||
do_overwrite: False | ||
seed: 1 | ||
tqdm: False | ||
worker: 0 | ||
loguru_init: False | ||
|
||
log_dir: ${output_dir}/.logs/ | ||
|
||
hydra: | ||
verbose: False | ||
job: | ||
name: MEDS_TAB_${name}_${worker}_${now:%Y-%m-%d_%H-%M-%S} | ||
sweep: | ||
dir: ${log_dir} | ||
run: | ||
dir: ${log_dir} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
defaults: | ||
- default | ||
- _self_ | ||
|
||
# split we wish to get metadata for | ||
split: train | ||
# Raw data for the selected split, read from the "final_cohort/<split>" subdirectory of MEDS_cohort_dir (e.g. "final_cohort/train") | ||
input_dir: ${MEDS_cohort_dir}/final_cohort/${split} | ||
# Where to store output code frequency data | ||
cache_dir: ${MEDS_cohort_dir}/.cache | ||
output_dir: ${MEDS_cohort_dir} | ||
output_filepath: ${output_dir}/code_metadata.parquet | ||
|
||
name: describe_codes |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
defaults: | ||
- default | ||
- tabularization: default | ||
- _self_ | ||
|
||
task_name: task | ||
# min code frequency used for modeling, can potentially sweep over different values. | ||
modeling_min_code_freq: 10 | ||
|
||
# Task cached data dir | ||
input_dir: ${MEDS_cohort_dir}/${task_name}/task_cache | ||
# Directory with task labels | ||
input_label_dir: ${MEDS_cohort_dir}/${task_name}/labels | ||
# Where to output the model and cached data | ||
output_dir: ${MEDS_cohort_dir}/model/model_${now:%Y-%m-%d_%H-%M-%S} | ||
output_filepath: ${output_dir}/model_metadata.parquet | ||
cache_dir: ${MEDS_cohort_dir}/.cache | ||
|
||
# Model parameters | ||
model_params: | ||
num_boost_round: 1000 | ||
early_stopping_rounds: 5 | ||
model: | ||
booster: gbtree | ||
device: cpu | ||
nthread: 1 | ||
tree_method: hist | ||
objective: binary:logistic | ||
iterator: | ||
keep_data_in_memory: True | ||
binarize_task: True | ||
|
||
# Define search space for Optuna | ||
optuna: | ||
study_name: xgboost_sweep_${now:%Y-%m-%d_%H-%M-%S} | ||
storage: null | ||
load_if_exists: False | ||
direction: minimize | ||
sampler: null | ||
pruner: null | ||
|
||
n_trials: 10 | ||
n_jobs: 1 | ||
show_progress_bar: False | ||
|
||
params: | ||
suggest_categorical: | ||
window_sizes: ${generate_permutations:${tabularization.window_sizes}} | ||
aggs: ${generate_permutations:${tabularization.aggs}} | ||
suggest_float: | ||
eta: | ||
low: .001 | ||
high: 1 | ||
log: True | ||
lambda: | ||
low: .001 | ||
high: 1 | ||
log: True | ||
alpha: | ||
low: .001 | ||
high: 1 | ||
log: True | ||
subsample: | ||
low: 0.5 | ||
high: 1 | ||
min_child_weight: | ||
low: 1e-2 | ||
high: 100 | ||
suggest_int: | ||
num_boost_round: | ||
low: 10 | ||
high: 1000 | ||
max_depth: | ||
low: 2 | ||
high: 16 | ||
min_code_inclusion_frequency: | ||
low: 10 | ||
high: 1_000_000 | ||
log: True | ||
|
||
name: launch_xgboost |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
defaults: | ||
- default | ||
- tabularization: default | ||
- _self_ | ||
|
||
# Raw data | ||
# Where the code metadata is stored | ||
input_code_metadata_fp: ${MEDS_cohort_dir}/code_metadata.parquet | ||
input_dir: ${MEDS_cohort_dir}/final_cohort | ||
output_dir: ${MEDS_cohort_dir}/tabularize | ||
|
||
name: tabularization |
Empty file.
22 changes: 22 additions & 0 deletions
22
src/MEDS_tabular_automl/configs/tabularization/default.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
# User inputs | ||
allowed_codes: null | ||
min_code_inclusion_frequency: 10 | ||
filtered_code_metadata_fp: ${MEDS_cohort_dir}/tabularized_code_metadata.parquet | ||
window_sizes: | ||
- "1d" | ||
- "7d" | ||
- "30d" | ||
- "365d" | ||
- "full" | ||
aggs: | ||
- "static/present" | ||
- "static/first" | ||
- "code/count" | ||
- "value/count" | ||
- "value/sum" | ||
- "value/sum_sqd" | ||
- "value/min" | ||
- "value/max" | ||
|
||
# Resolved inputs | ||
_resolved_codes: ${filter_to_codes:${tabularization.allowed_codes},${tabularization.min_code_inclusion_frequency},${tabularization.filtered_code_metadata_fp}} |
14 changes: 14 additions & 0 deletions
14
src/MEDS_tabular_automl/configs/task_specific_caching.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
defaults: | ||
- default | ||
- tabularization: default | ||
- _self_ | ||
task_name: task | ||
|
||
# Tabularized Data | ||
input_dir: ${MEDS_cohort_dir}/tabularize | ||
# Where the labels are stored, with columns patient_id, timestamp, label | ||
input_label_dir: ${MEDS_cohort_dir}/${task_name}/labels | ||
# Where to output the task specific tabularized data | ||
output_dir: ${MEDS_cohort_dir}/${task_name}/task_cache | ||
|
||
name: task_specific_caching |
Oops, something went wrong.