diff --git a/.github/workflows/publish-to-pypi.yml b/.github/workflows/publish-to-pypi.yml index d86806f..7a68958 100644 --- a/.github/workflows/publish-to-pypi.yml +++ b/.github/workflows/publish-to-pypi.yml @@ -36,7 +36,7 @@ jobs: runs-on: ubuntu-latest environment: name: pypi - url: https://pypi.org/p/ # Replace with your PyPI project name + url: https://pypi.org/p/meds-tab # Replace with your PyPI project name permissions: id-token: write # IMPORTANT: mandatory for trusted publishing @@ -91,27 +91,3 @@ jobs: gh release upload '${{ github.ref_name }}' dist/** --repo '${{ github.repository }}' - - publish-to-testpypi: - name: Publish Python 🐍 distribution 📦 to TestPyPI - needs: - - build - runs-on: ubuntu-latest - - environment: - name: testpypi - url: https://test.pypi.org/p/ - - permissions: - id-token: write # IMPORTANT: mandatory for trusted publishing - - steps: - - name: Download all the dists - uses: actions/download-artifact@v3 - with: - name: python-package-distributions - path: dist/ - - name: Publish distribution 📦 to TestPyPI - uses: pypa/gh-action-pypi-publish@release/v1 - with: - repository-url: https://test.pypi.org/legacy/ diff --git a/README.md b/README.md index 8900e41..ac704f8 100644 --- a/README.md +++ b/README.md @@ -84,12 +84,12 @@ By following these steps, you can seamlessly transform your dataset, define nece ```console # Re-shard pipeline - # $MIMICIV_MEDS_DIR is the directory containing the input, MEDS v0.3 formatted MIMIC-IV data + # $MIMICIV_input_dir is the directory containing the input, MEDS v0.3 formatted MIMIC-IV data # $MEDS_TAB_COHORT_DIR is the directory where the re-sharded MEDS dataset will be stored, and where your model # will store cached files during processing by default. # $N_PATIENTS_PER_SHARD is the number of patients per shard you want to use. MEDS_transform-reshard_to_split \ - input_dir="$MIMICIV_MEDS_DIR" \ + input_dir="$MIMICIV_input_dir" \ cohort_dir="$MEDS_TAB_COHORT_DIR" \ 'stages=["reshard_to_split"]' \ stage="reshard_to_split" \ @@ -103,14 +103,14 @@ By following these steps, you can seamlessly transform your dataset, define nece - static codes (codes without timestamps) - static numerical codes (codes without timestamps but with numerical values). - This script further caches feature names and frequencies in a dataset stored in a `code_metadata.parquet` file within the `MEDS_cohort_dir` argument specified as a hydra-style command line argument. + This script further caches feature names and frequencies in a dataset stored in a `code_metadata.parquet` file within the `input_dir` argument specified as a hydra-style command line argument. 2. **`meds-tab-tabularize-static`**: Filters and processes the dataset based on the frequency of codes, generating a tabular vector for each patient at each timestamp in the shards. Each row corresponds to a unique `subject_id` and `timestamp` combination, thus rows are duplicated across multiple timestamps for the same patient. 
**Example: Tabularizing static data** with the minimum code frequency of 10, window sizes of `[1d, 30d, 365d, full]`, and value aggregation methods of `[static/present, static/first, code/count, value/count, value/sum, value/sum_sqd, value/min, value/max]` ```console - meds-tab-tabularize-static MEDS_cohort_dir="path_to_data" \ + meds-tab-tabularize-static input_dir="path_to_data" \ tabularization.min_code_inclusion_frequency=10 \ tabularization.window_sizes=[1d,30d,365d,full] \ do_overwrite=False \ @@ -127,19 +127,19 @@ By following these steps, you can seamlessly transform your dataset, define nece meds-tab-tabularize-time-series --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ - MEDS_cohort_dir="path_to_data" \ + input_dir="path_to_data" \ tabularization.min_code_inclusion_frequency=10 \ do_overwrite=False \ tabularization.window_sizes=[1d,30d,365d,full] \ tabularization.aggs=[static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max] ``` -4. **`meds-tab-cache-task`**: Aligns task-specific labels with the nearest prior event in the tabularized data. It requires a labeled dataset directory with three columns (`subject_id`, `timestamp`, `label`) structured similarly to the `MEDS_cohort_dir`. +4. **`meds-tab-cache-task`**: Aligns task-specific labels with the nearest prior event in the tabularized data. It requires a labeled dataset directory with three columns (`subject_id`, `timestamp`, `label`) structured similarly to the `input_dir`. **Example: Align tabularized data** for a specific task `$TASK` and labels that has pulled from [ACES](https://github.com/justin13601/ACES) ```console - meds-tab-cache-task MEDS_cohort_dir="path_to_data" \ + meds-tab-cache-task input_dir="path_to_data" \ task_name=$TASK \ tabularization.min_code_inclusion_frequency=10 \ do_overwrite=False \ @@ -151,7 +151,7 @@ By following these steps, you can seamlessly transform your dataset, define nece ```console meds-tab-xgboost --multirun \ - MEDS_cohort_dir="path_to_data" \ + input_dir="path_to_data" \ task_name=$TASK \ output_dir="output_directory" \ tabularization.min_code_inclusion_frequency=10 \ @@ -436,7 +436,7 @@ A single XGBoost run was completed to profile time and memory usage. 
This was do ```console meds-tab-xgboost - MEDS_cohort_dir="path_to_data" \ + input_dir="path_to_data" \ task_name=$TASK \ output_dir="output_directory" \ do_overwrite=False \ @@ -506,7 +506,7 @@ The XGBoost sweep was run using the following command for each `$TASK`: ```console meds-tab-xgboost --multirun \ - MEDS_cohort_dir="path_to_data" \ + input_dir="path_to_data" \ task_name=$TASK \ output_dir="output_directory" \ tabularization.window_sizes=$(generate-subsets [1d,30d,365d,full]) \ @@ -529,14 +529,14 @@ The hydra sweeper swept over the parameters: ```yaml params: - +model_params.model.eta: tag(log, interval(0.001, 1)) - +model_params.model.lambda: tag(log, interval(0.001, 1)) - +model_params.model.alpha: tag(log, interval(0.001, 1)) - +model_params.model.subsample: interval(0.5, 1) - +model_params.model.min_child_weight: interval(1e-2, 100) - +model_params.model.max_depth: range(2, 16) - model_params.num_boost_round: range(100, 1000) - model_params.early_stopping_rounds: range(1, 10) + model.eta: tag(log, interval(0.001, 1)) + model.lambda: tag(log, interval(0.001, 1)) + model.alpha: tag(log, interval(0.001, 1)) + model.subsample: interval(0.5, 1) + model.min_child_weight: interval(1e-2, 100) + model.max_depth: range(2, 16) + num_boost_round: range(100, 1000) + early_stopping_rounds: range(1, 10) tabularization.min_code_inclusion_frequency: tag(log, range(10, 1000000)) ``` diff --git a/docs/source/overview.md b/docs/source/overview.md index 44f68bf..1d453f0 100644 --- a/docs/source/overview.md +++ b/docs/source/overview.md @@ -38,14 +38,14 @@ See [`/tests/test_integration.py`](https://github.com/mmcdermott/MEDS_Tabular_Au - static codes (codes without timestamps) - static numerical codes (codes without timestamps but with numerical values). - This script further caches feature names and frequencies in a dataset stored in a `code_metadata.parquet` file within the `MEDS_cohort_dir` argument specified as a hydra-style command line argument. + This script further caches feature names and frequencies in a dataset stored in a `code_metadata.parquet` file within the `input_dir` argument specified as a hydra-style command line argument. 2. **`meds-tab-tabularize-static`**: Filters and processes the dataset based on the frequency of codes, generating a tabular vector for each patient at each timestamp in the shards. Each row corresponds to a unique `subject_id` and `timestamp` combination, thus rows are duplicated across multiple timestamps for the same patient. **Example: Tabularizing static data** with the minimum code frequency of 10, window sizes of `[1d, 30d, 365d, full]`, and value aggregation methods of `[static/present, static/first, code/count, value/count, value/sum, value/sum_sqd, value/min, value/max]` ```console - meds-tab-tabularize-static MEDS_cohort_dir="path_to_data" \ + meds-tab-tabularize-static input_dir="path_to_data" \ tabularization.min_code_inclusion_frequency=10 \ tabularization.window_sizes=[1d,30d,365d,full] \ do_overwrite=False \ @@ -62,19 +62,19 @@ See [`/tests/test_integration.py`](https://github.com/mmcdermott/MEDS_Tabular_Au meds-tab-tabularize-time-series --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ - MEDS_cohort_dir="path_to_data" \ + input_dir="path_to_data" \ tabularization.min_code_inclusion_frequency=10 \ do_overwrite=False \ tabularization.window_sizes=[1d,30d,365d,full] \ tabularization.aggs=[static/present,static/first,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max] ``` -4. 
**`meds-tab-cache-task`**: Aligns task-specific labels with the nearest prior event in the tabularized data. It requires a labeled dataset directory with three columns (`subject_id`, `timestamp`, `label`) structured similarly to the `MEDS_cohort_dir`. +4. **`meds-tab-cache-task`**: Aligns task-specific labels with the nearest prior event in the tabularized data. It requires a labeled dataset directory with three columns (`subject_id`, `timestamp`, `label`) structured similarly to the `input_dir`. **Example: Align tabularized data** for a specific task `$TASK` and labels that has pulled from [ACES](https://github.com/justin13601/ACES) ```console - meds-tab-cache-task MEDS_cohort_dir="path_to_data" \ + meds-tab-cache-task input_dir="path_to_data" \ task_name=$TASK \ tabularization.min_code_inclusion_frequency=10 \ do_overwrite=False \ @@ -86,7 +86,7 @@ See [`/tests/test_integration.py`](https://github.com/mmcdermott/MEDS_Tabular_Au ```console meds-tab-xgboost --multirun \ - MEDS_cohort_dir="path_to_data" \ + input_dir="path_to_data" \ task_name=$TASK \ output_dir="output_directory" \ tabularization.min_code_inclusion_frequency=10 \ diff --git a/docs/source/prediction.md b/docs/source/prediction.md index 35131d8..18f19c0 100644 --- a/docs/source/prediction.md +++ b/docs/source/prediction.md @@ -14,7 +14,7 @@ A single XGBoost run was completed to profile time and memory usage. This was do ```console meds-tab-xgboost - MEDS_cohort_dir="path_to_data" \ + input_dir="path_to_data" \ task_name=$TASK \ output_dir="output_directory" \ do_overwrite=False \ @@ -84,7 +84,7 @@ The XGBoost sweep was run using the following command for each `$TASK`: ```console meds-tab-xgboost --multirun \ - MEDS_cohort_dir="path_to_data" \ + input_dir="path_to_data" \ task_name=$TASK \ output_dir="output_directory" \ tabularization.window_sizes=$(generate-permutations [1d,30d,365d,full]) \ @@ -107,14 +107,14 @@ The hydra sweeper swept over the parameters: ```yaml params: - +model_params.model.eta: tag(log, interval(0.001, 1)) - +model_params.model.lambda: tag(log, interval(0.001, 1)) - +model_params.model.alpha: tag(log, interval(0.001, 1)) - +model_params.model.subsample: interval(0.5, 1) - +model_params.model.min_child_weight: interval(1e-2, 100) - +model_params.model.max_depth: range(2, 16) - model_params.num_boost_round: range(100, 1000) - model_params.early_stopping_rounds: range(1, 10) + model.eta: tag(log, interval(0.001, 1)) + model.lambda: tag(log, interval(0.001, 1)) + model.alpha: tag(log, interval(0.001, 1)) + model.subsample: interval(0.5, 1) + model.min_child_weight: interval(1e-2, 100) + model.max_depth: range(2, 16) + num_boost_round: range(100, 1000) + early_stopping_rounds: range(1, 10) tabularization.min_code_inclusion_frequency: tag(log, range(10, 1000000)) ``` diff --git a/pyproject.toml b/pyproject.toml index 59b30cf..1b75489 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,8 @@ dependencies = [ "scikit-learn", "hydra-optuna-sweeper", "hydra-joblib-launcher", "ml-mixins", "meds==0.3.3", "meds-transforms==0.0.7", ] +[tool.setuptools_scm] + [project.scripts] meds-tab-describe = "MEDS_tabular_automl.scripts.describe_codes:main" meds-tab-tabularize-static = "MEDS_tabular_automl.scripts.tabularize_static:main" diff --git a/src/MEDS_tabular_automl/configs/default.yaml b/src/MEDS_tabular_automl/configs/default.yaml index 82a2164..7d4e392 100644 --- a/src/MEDS_tabular_automl/configs/default.yaml +++ b/src/MEDS_tabular_automl/configs/default.yaml @@ -1,13 +1,13 @@ -MEDS_cohort_dir: ??? 
-output_cohort_dir: ??? +input_dir: ??? +output_dir: ??? do_overwrite: False seed: 1 tqdm: False worker: 0 loguru_init: False -log_dir: ${output_cohort_dir}/.logs/ -cache_dir: ${output_cohort_dir}/.cache +log_dir: ${output_dir}/.logs/ +cache_dir: ${output_dir}/.cache hydra: verbose: False diff --git a/src/MEDS_tabular_automl/configs/describe_codes.yaml b/src/MEDS_tabular_automl/configs/describe_codes.yaml index ec980bf..007307c 100644 --- a/src/MEDS_tabular_automl/configs/describe_codes.yaml +++ b/src/MEDS_tabular_automl/configs/describe_codes.yaml @@ -2,8 +2,7 @@ defaults: - default - _self_ -input_dir: ${output_cohort_dir}/data # Where to store output code frequency data -output_filepath: ${output_cohort_dir}/metadata/codes.parquet +output_filepath: ${output_dir}/metadata/codes.parquet name: describe_codes diff --git a/src/MEDS_tabular_automl/configs/launch_autogluon.yaml b/src/MEDS_tabular_automl/configs/launch_autogluon.yaml deleted file mode 100644 index 908e79d..0000000 --- a/src/MEDS_tabular_automl/configs/launch_autogluon.yaml +++ /dev/null @@ -1,28 +0,0 @@ -defaults: - - default - - tabularization: default - - imputer: default - - normalization: default - - _self_ - -task_name: task - -# Task cached data dir -input_dir: ${output_cohort_dir}/${task_name}/task_cache -# Directory with task labels -input_label_dir: ${output_cohort_dir}/${task_name}/labels/ -# Where to output the model and cached data -model_dir: ${output_cohort_dir}/autogluon/autogluon_${now:%Y-%m-%d_%H-%M-%S} -model_log_dir: ${model_dir}/.logs/ -output_filepath: ${model_dir} - -# Model parameters -model_params: - iterator: - keep_data_in_memory: True - binarize_task: True - -log_dir: ${model_dir}/.logs/ -log_filepath: ${log_dir}/log.txt - -name: launch_autogluon diff --git a/src/MEDS_tabular_automl/configs/launch_model.yaml b/src/MEDS_tabular_automl/configs/launch_model.yaml index 9938cd9..7008acf 100644 --- a/src/MEDS_tabular_automl/configs/launch_model.yaml +++ b/src/MEDS_tabular_automl/configs/launch_model.yaml @@ -1,39 +1,28 @@ defaults: - - _self_ - default - tabularization: default - - model: xgboost # This can be changed to sgd_classifier or any other model - - imputer: default - - normalization: default - - override hydra/callbacks: evaluation_callback + - model_launcher: xgboost - override hydra/sweeper: optuna - - override hydra/sweeper/sampler: tpe + - override hydra/callbacks: evaluation_callback - override hydra/launcher: joblib + - _self_ -task_name: task +task_name: ??? -# Task cached data dir -input_dir: ${output_cohort_dir}/${task_name}/task_cache -# Directory with task labels -input_label_dir: ${output_cohort_dir}/${task_name}/labels/ +# Location of task, split, and shard specific tabularized data +input_tabularized_cache_dir: ${output_dir}/${task_name}/task_cache +# Location of task, split, and shard specific label data +input_label_cache_dir: ${output_dir}/${task_name}/labels # Where to output the model and cached data -model_saving: - model_dir: ${output_cohort_dir}/model/model_${now:%Y-%m-%d_%H-%M-%S} - model_file_stem: model - model_file_extension: .json - delete_below_top_k: -1 -model_logging: - model_log_dir: ${model_saving.model_dir}/.logs/ - performance_log_stem: performance - config_log_stem: config +output_model_dir: ??? 
+ +delete_below_top_k: -1 name: launch_model hydra: - verbose: False - job: - name: MEDS_TAB_${name}_${worker}_${now:%Y-%m-%d_%H-%M-%S} sweep: - dir: ${model_log_dir} + dir: ${output_model_dir}/sweeps/{now:%Y-%m-%d-%H-%M-%S}/ + subdir: "1" run: - dir: ${model_log_dir} + dir: ${path.model_log_dir} diff --git a/src/MEDS_tabular_automl/configs/model/knn_classifier.yaml b/src/MEDS_tabular_automl/configs/model/knn_classifier.yaml deleted file mode 100644 index 1ca034a..0000000 --- a/src/MEDS_tabular_automl/configs/model/knn_classifier.yaml +++ /dev/null @@ -1,44 +0,0 @@ -# @package _global_ - -model_target: - _target_: MEDS_tabular_automl.sklearn_model.SklearnModel.initialize - model_params: ${model_params} - input_dir: ${input_dir} - input_label_dir: ${input_label_dir} - model_dir: ${model_saving.model_dir} - model_file_stem: ${model_saving.model_file_stem} - model_file_extension: ${model_saving.model_file_extension} - log_dir: ${log_dir} - cache_dir: ${cache_dir} - imputer: ${model_params.iterator.imputer} - normalization: ${model_params.iterator.normalization} - -model_params: - epochs: 20 - early_stopping_rounds: 5 - model: - _target_: sklearn.neighbors.KNeighborsClassifier - weights: "distance" - leaf_size: 30 - p: 2 - metric: "minkowski" - iterator: - keep_data_in_memory: True - binarize_task: True - normalization: ${normalization} - imputer: ${imputer} - -hydra: - sweeper: - direction: maximize - n_trials: 250 - n_jobs: 25 - - params: - model_params.model.n_neighbors: range(1, 20) - model_params.model.weights: choice(['uniform', 'distance']) - model_params.model.leaf_size: range(10, 50) - model_params.model.p: choice([1, 2]) - model_params.model.metric: choice(['minkowski', 'euclidean', 'manhattan']) - model_params.epochs: range(10, 100) - model_params.early_stopping_rounds: range(1, 10) diff --git a/src/MEDS_tabular_automl/configs/model/logistic_regression.yaml b/src/MEDS_tabular_automl/configs/model/logistic_regression.yaml deleted file mode 100644 index 0f74a7b..0000000 --- a/src/MEDS_tabular_automl/configs/model/logistic_regression.yaml +++ /dev/null @@ -1,49 +0,0 @@ -# @package _global_ - -model_target: - _target_: MEDS_tabular_automl.sklearn_model.SklearnModel.initialize - model_params: ${model_params} - input_dir: ${input_dir} - input_label_dir: ${input_label_dir} - model_dir: ${model_saving.model_dir} - model_file_stem: ${model_saving.model_file_stem} - model_file_extension: ${model_saving.model_file_extension} - log_dir: ${log_dir} - cache_dir: ${cache_dir} - imputer: ${model_params.iterator.imputer} - normalization: ${model_params.iterator.normalization} - -model_params: - epochs: 20 - early_stopping_rounds: 5 - model: - _target_: sklearn.linear_model.LogisticRegression - penalty: "l2" - dual: false - tol: 0.0001 - C: 1.0 - fit_intercept: True - intercept_scaling: 1 - class_weight: null - random_state: null - solver: "lbfgs" - max_iter: 100 - - iterator: - keep_data_in_memory: True - binarize_task: True - normalization: ${normalization} - imputer: ${imputer} - -hydra: - sweeper: - direction: maximize - n_trials: 250 - n_jobs: 25 - - params: - model_params.model.C: tag(log, interval(1e-6, 1)) - model_params.model.penalty: choice(['l1', 'l2', 'elasticnet']) - model_params.model.solver: choice(['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']) - model_params.epochs: range(10, 100) - model_params.early_stopping_rounds: range(1, 10) diff --git a/src/MEDS_tabular_automl/configs/model/random_forest_classifier.yaml b/src/MEDS_tabular_automl/configs/model/random_forest_classifier.yaml 
deleted file mode 100644 index 58a9671..0000000 --- a/src/MEDS_tabular_automl/configs/model/random_forest_classifier.yaml +++ /dev/null @@ -1,51 +0,0 @@ -# @package _global_ - -model_target: - _target_: MEDS_tabular_automl.sklearn_model.SklearnModel.initialize - model_params: ${model_params} - input_dir: ${input_dir} - input_label_dir: ${input_label_dir} - model_dir: ${model_saving.model_dir} - model_file_stem: ${model_saving.model_file_stem} - model_file_extension: ${model_saving.model_file_extension} - log_dir: ${log_dir} - cache_dir: ${cache_dir} - imputer: ${model_params.iterator.imputer} - normalization: ${model_params.iterator.normalization} - -model_params: - epochs: 20 - early_stopping_rounds: 5 - model: - _target_: sklearn.ensemble.RandomForestClassifier - criterion: "gini" - max_depth: null - min_samples_split: 2 - min_samples_leaf: 1 - min_weight_fraction_leaf: 0.0 - max_features: "sqrt" - max_leaf_nodes: null - min_impurity_decrease: 0.0 - bootstrap: True - iterator: - keep_data_in_memory: True - binarize_task: True - normalization: ${normalization} - imputer: ${imputer} - -hydra: - sweeper: - direction: maximize - n_trials: 250 - n_jobs: 25 - - params: - model_params.model.n_estimators: range(50, 300, 50) - model_params.model.max_depth: choice([null, 10, 20, 30, 40, 50]) - model_params.model.min_samples_split: range(2, 11) - model_params.model.min_samples_leaf: range(1, 5) - model_params.model.max_features: choice(['sqrt', 'log2', null]) - model_params.model.bootstrap: choice([True, False]) - model_params.model.criterion: choice(['gini', 'entropy']) - model_params.epochs: range(10, 100) - model_params.early_stopping_rounds: range(1, 10) diff --git a/src/MEDS_tabular_automl/configs/model/sgd_classifier.yaml b/src/MEDS_tabular_automl/configs/model/sgd_classifier.yaml deleted file mode 100644 index 2f2b57f..0000000 --- a/src/MEDS_tabular_automl/configs/model/sgd_classifier.yaml +++ /dev/null @@ -1,38 +0,0 @@ -# @package _global_ - -model_target: - _target_: MEDS_tabular_automl.sklearn_model.SklearnModel.initialize - model_params: ${model_params} - input_dir: ${input_dir} - input_label_dir: ${input_label_dir} - model_dir: ${model_saving.model_dir} - model_file_stem: ${model_saving.model_file_stem} - model_file_extension: ${model_saving.model_file_extension} - log_dir: ${log_dir} - cache_dir: ${cache_dir} - imputer: ${model_params.iterator.imputer} - normalization: ${model_params.iterator.normalization} - -model_params: - epochs: 20 - early_stopping_rounds: 5 - model: - _target_: sklearn.linear_model.SGDClassifier - loss: log_loss - iterator: - keep_data_in_memory: True - binarize_task: True - normalization: ${normalization} - imputer: ${imputer} - -hydra: - sweeper: - direction: maximize - n_trials: 250 - n_jobs: 25 - params: - +model_params.model.alpha: tag(log, interval(1e-6, 1)) - +model_params.model.l1_ratio: interval(0, 1) - +model_params.model.penalty: choice(['l1', 'l2', 'elasticnet']) - model_params.epochs: range(10, 100) - model_params.early_stopping_rounds: range(1, 10) diff --git a/src/MEDS_tabular_automl/configs/model/xgboost.yaml b/src/MEDS_tabular_automl/configs/model/xgboost.yaml deleted file mode 100644 index 793cc29..0000000 --- a/src/MEDS_tabular_automl/configs/model/xgboost.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# @package _global_ - -model_target: - _target_: MEDS_tabular_automl.xgboost_model.XGBoostModel.initialize - model_params: ${model_params} - input_dir: ${input_dir} - input_label_dir: ${input_label_dir} - model_dir: ${model_saving.model_dir} - 
model_file_stem: ${model_saving.model_file_stem} - model_file_extension: ${model_saving.model_file_extension} - log_dir: ${log_dir} - cache_dir: ${cache_dir} - imputer: ${model_params.iterator.imputer} - normalization: ${model_params.iterator.normalization} - # tabularization: ${tabularization} # Ideally we should define tabularization here, but there is an issue initializing with it's resolvers. - -model_params: - num_boost_round: 1000 - early_stopping_rounds: 5 - model: - booster: gbtree - device: cpu - nthread: 1 - tree_method: hist - objective: binary:logistic - iterator: - keep_data_in_memory: True - binarize_task: True - normalization: ${normalization} - imputer: ${imputer} - -hydra: - sweeper: - direction: maximize - n_trials: 250 - n_jobs: 25 - - params: - +model_params.model.eta: tag(log, interval(0.001, 1)) - +model_params.model.lambda: tag(log, interval(0.001, 1)) - +model_params.model.alpha: tag(log, interval(0.001, 1)) - +model_params.model.subsample: interval(0.5, 1) - +model_params.model.min_child_weight: interval(1e-2, 100) - model_params.num_boost_round: range(100, 1000) - model_params.early_stopping_rounds: range(1, 10) - +model_params.model.max_depth: range(2, 16) - tabularization.min_code_inclusion_frequency: tag(log, range(10, 1000000)) diff --git a/src/MEDS_tabular_automl/configs/model_launcher/__init__.py b/src/MEDS_tabular_automl/configs/model_launcher/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/MEDS_tabular_automl/configs/model_launcher/autogluon.yaml b/src/MEDS_tabular_automl/configs/model_launcher/autogluon.yaml new file mode 100644 index 0000000..b7d02cd --- /dev/null +++ b/src/MEDS_tabular_automl/configs/model_launcher/autogluon.yaml @@ -0,0 +1,3 @@ +defaults: + - default + - _self_ diff --git a/src/MEDS_tabular_automl/configs/model_launcher/data_loading_params/__init__.py b/src/MEDS_tabular_automl/configs/model_launcher/data_loading_params/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/MEDS_tabular_automl/configs/model_launcher/data_loading_params/default.yaml b/src/MEDS_tabular_automl/configs/model_launcher/data_loading_params/default.yaml new file mode 100644 index 0000000..723131f --- /dev/null +++ b/src/MEDS_tabular_automl/configs/model_launcher/data_loading_params/default.yaml @@ -0,0 +1,2 @@ +keep_data_in_memory: True +binarize_task: True diff --git a/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/__init__.py b/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/default.yaml b/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/default.yaml new file mode 100644 index 0000000..5cf0c5b --- /dev/null +++ b/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/default.yaml @@ -0,0 +1,3 @@ +defaults: + - imputer: default + - normalization: default diff --git a/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/imputer/__init__.py b/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/imputer/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/MEDS_tabular_automl/configs/imputer/default.yaml b/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/imputer/default.yaml similarity index 100% rename from src/MEDS_tabular_automl/configs/imputer/default.yaml rename to 
src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/imputer/default.yaml diff --git a/src/MEDS_tabular_automl/configs/imputer/mean_imputer.yaml b/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/imputer/mean_imputer.yaml similarity index 100% rename from src/MEDS_tabular_automl/configs/imputer/mean_imputer.yaml rename to src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/imputer/mean_imputer.yaml diff --git a/src/MEDS_tabular_automl/configs/imputer/median_imputer.yaml b/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/imputer/median_imputer.yaml similarity index 100% rename from src/MEDS_tabular_automl/configs/imputer/median_imputer.yaml rename to src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/imputer/median_imputer.yaml diff --git a/src/MEDS_tabular_automl/configs/imputer/mode_imputer.yaml b/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/imputer/mode_imputer.yaml similarity index 100% rename from src/MEDS_tabular_automl/configs/imputer/mode_imputer.yaml rename to src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/imputer/mode_imputer.yaml diff --git a/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/normalization/__init__.py b/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/normalization/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/MEDS_tabular_automl/configs/normalization/default.yaml b/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/normalization/default.yaml similarity index 100% rename from src/MEDS_tabular_automl/configs/normalization/default.yaml rename to src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/normalization/default.yaml diff --git a/src/MEDS_tabular_automl/configs/normalization/max_abs_scaler.yaml b/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/normalization/max_abs_scaler.yaml similarity index 100% rename from src/MEDS_tabular_automl/configs/normalization/max_abs_scaler.yaml rename to src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/normalization/max_abs_scaler.yaml diff --git a/src/MEDS_tabular_automl/configs/normalization/standard_scaler.yaml b/src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/normalization/standard_scaler.yaml similarity index 100% rename from src/MEDS_tabular_automl/configs/normalization/standard_scaler.yaml rename to src/MEDS_tabular_automl/configs/model_launcher/data_processing_params/normalization/standard_scaler.yaml diff --git a/src/MEDS_tabular_automl/configs/model_launcher/default.yaml b/src/MEDS_tabular_automl/configs/model_launcher/default.yaml new file mode 100644 index 0000000..7b75e6e --- /dev/null +++ b/src/MEDS_tabular_automl/configs/model_launcher/default.yaml @@ -0,0 +1,13 @@ +# @package _global_ + +defaults: + - path: default + - data_processing_params: default + - data_loading_params: default + - _self_ + +model_launcher: + path: ${path} + data_processing_params: ${data_processing_params} + data_loading_params: ${data_loading_params} + tabularization: ${tabularization} diff --git a/src/MEDS_tabular_automl/configs/model_launcher/knn_classifier.yaml b/src/MEDS_tabular_automl/configs/model_launcher/knn_classifier.yaml new file mode 100644 index 0000000..9f85e97 --- /dev/null +++ b/src/MEDS_tabular_automl/configs/model_launcher/knn_classifier.yaml @@ -0,0 +1,30 @@ +# @package _global_ + +defaults: + - default + - 
training_params: default + - _self_ + +model_launcher: + _target_: MEDS_tabular_automl.sklearn_model.SklearnModel.initialize + + model: + _target_: sklearn.neighbors.KNeighborsClassifier + weights: "distance" + leaf_size: 30 + p: 2 + metric: "minkowski" + + path: + model_file_extension: .pkl + +hydra: + sweeper: + params: + +model_launcher.model.n_neighbors: range(1, 20) + model_launcher.model.weights: choice('uniform', 'distance') + model_launcher.model.leaf_size: range(10, 50) + model_launcher.model.p: choice(1, 2) + model_launcher.model.metric: choice('minkowski', 'euclidean', 'manhattan') + model_launcher.training_params.epochs: range(10, 100) + model_launcher.training_params.early_stopping_rounds: range(1, 10) diff --git a/src/MEDS_tabular_automl/configs/model_launcher/logistic_regression.yaml b/src/MEDS_tabular_automl/configs/model_launcher/logistic_regression.yaml new file mode 100644 index 0000000..4531efc --- /dev/null +++ b/src/MEDS_tabular_automl/configs/model_launcher/logistic_regression.yaml @@ -0,0 +1,33 @@ +# @package _global_ + +defaults: + - default + - training_params: default + - _self_ + +model_launcher: + _target_: MEDS_tabular_automl.sklearn_model.SklearnModel.initialize + + model: + _target_: sklearn.linear_model.LogisticRegression + penalty: "l2" + dual: false + tol: 0.0001 + C: 1.0 + fit_intercept: True + intercept_scaling: 1 + class_weight: null + random_state: null + solver: "lbfgs" + max_iter: 100 + + path: + model_file_extension: .pkl + +hydra: + sweeper: + params: + model_launcher.model.C: tag(log, interval(1e-6, 1)) + model_launcher.model.solver: choice('lbfgs', 'sag', 'saga') + model_launcher.training_params.epochs: range(10, 100) + model_launcher.training_params.early_stopping_rounds: range(1, 10) diff --git a/src/MEDS_tabular_automl/configs/model_launcher/path/__init__.py b/src/MEDS_tabular_automl/configs/model_launcher/path/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/MEDS_tabular_automl/configs/model_launcher/path/default.yaml b/src/MEDS_tabular_automl/configs/model_launcher/path/default.yaml new file mode 100644 index 0000000..d739ce3 --- /dev/null +++ b/src/MEDS_tabular_automl/configs/model_launcher/path/default.yaml @@ -0,0 +1,10 @@ +input_tabularized_cache_dir: ${input_tabularized_cache_dir} +input_label_cache_dir: ${input_label_cache_dir} +output_model_dir: ${output_model_dir} +model_file_stem: model +model_file_extension: .json +log_dir: ${log_dir} +cache_dir: ${cache_dir} +model_log_dir: ${output_model_dir}/.logs/ +performance_log_stem: performance +config_log_stem: config diff --git a/src/MEDS_tabular_automl/configs/model_launcher/random_forest_classifier.yaml b/src/MEDS_tabular_automl/configs/model_launcher/random_forest_classifier.yaml new file mode 100644 index 0000000..4a50beb --- /dev/null +++ b/src/MEDS_tabular_automl/configs/model_launcher/random_forest_classifier.yaml @@ -0,0 +1,37 @@ +# @package _global_ + +defaults: + - default + - training_params: default + - _self_ + +model_launcher: + _target_: MEDS_tabular_automl.sklearn_model.SklearnModel.initialize + + model: + _target_: sklearn.ensemble.RandomForestClassifier + criterion: "gini" + max_depth: null + min_samples_split: 2 + min_samples_leaf: 1 + min_weight_fraction_leaf: 0.0 + max_features: "sqrt" + max_leaf_nodes: null + min_impurity_decrease: 0.0 + bootstrap: True + + path: + model_file_extension: .pkl + +hydra: + sweeper: + params: + +model_launcher.model.n_estimators: range(50, 300, 50) + model_launcher.model.max_depth: choice(10, 20, 30, 40, 50) + 
model_launcher.model.min_samples_split: range(2, 11) + model_launcher.model.min_samples_leaf: range(1, 5) + model_launcher.model.max_features: choice('sqrt', 'log2') + model_launcher.model.bootstrap: choice(True, False) + model_launcher.model.criterion: choice('gini', 'entropy') + model_launcher.training_params.epochs: range(10, 100) + model_launcher.training_params.early_stopping_rounds: range(1, 10) diff --git a/src/MEDS_tabular_automl/configs/model_launcher/sgd_classifier.yaml b/src/MEDS_tabular_automl/configs/model_launcher/sgd_classifier.yaml new file mode 100644 index 0000000..9f6cb1d --- /dev/null +++ b/src/MEDS_tabular_automl/configs/model_launcher/sgd_classifier.yaml @@ -0,0 +1,24 @@ +# @package _global_ +defaults: + - default + - training_params: default + - _self_ + +model_launcher: + _target_: MEDS_tabular_automl.sklearn_model.SklearnModel.initialize + + model: + _target_: sklearn.linear_model.SGDClassifier + loss: log_loss + + path: + model_file_extension: .pkl + +hydra: + sweeper: + params: + +model_launcher.model.alpha: tag(log, interval(1e-6, 1)) + +model_launcher.model.l1_ratio: interval(0, 1) + +model_launcher.model.penalty: choice('l1', 'l2', 'elasticnet') + model_launcher.training_params.epochs: range(10, 100) + model_launcher.training_params.early_stopping_rounds: range(1, 10) diff --git a/src/MEDS_tabular_automl/configs/model_launcher/training_params/__init__.py b/src/MEDS_tabular_automl/configs/model_launcher/training_params/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/MEDS_tabular_automl/configs/model_launcher/training_params/default.yaml b/src/MEDS_tabular_automl/configs/model_launcher/training_params/default.yaml new file mode 100644 index 0000000..abd29f2 --- /dev/null +++ b/src/MEDS_tabular_automl/configs/model_launcher/training_params/default.yaml @@ -0,0 +1,6 @@ +# @package _global_ + +model_launcher: + training_params: + epochs: 20 + early_stopping_rounds: 5 diff --git a/src/MEDS_tabular_automl/configs/model_launcher/xgboost.yaml b/src/MEDS_tabular_automl/configs/model_launcher/xgboost.yaml new file mode 100644 index 0000000..b7e9065 --- /dev/null +++ b/src/MEDS_tabular_automl/configs/model_launcher/xgboost.yaml @@ -0,0 +1,31 @@ +# @package _global_ +defaults: + - default + - _self_ + +model_launcher: + _target_: MEDS_tabular_automl.xgboost_model.XGBoostModel.initialize + + model: + booster: gbtree + device: cpu + nthread: 1 + tree_method: hist + objective: binary:logistic + + training_params: + num_boost_round: 1000 + early_stopping_rounds: 5 + +hydra: + sweeper: + params: + +model_launcher.model.eta: tag(log, interval(0.001, 1)) + +model_launcher.model.lambda: tag(log, interval(0.001, 1)) + +model_launcher.model.alpha: tag(log, interval(0.001, 1)) + +model_launcher.model.subsample: interval(0.5, 1) + +model_launcher.model.min_child_weight: interval(1e-2, 100) + +model_launcher.model.max_depth: range(2, 16) + model_launcher.training_params.num_boost_round: range(100, 1000) + model_launcher.training_params.early_stopping_rounds: range(1, 10) + tabularization.min_code_inclusion_frequency: tag(log, range(10, 1000000)) diff --git a/src/MEDS_tabular_automl/configs/tabularization.yaml b/src/MEDS_tabular_automl/configs/tabularization.yaml index cf03d63..5d74eb9 100644 --- a/src/MEDS_tabular_automl/configs/tabularization.yaml +++ b/src/MEDS_tabular_automl/configs/tabularization.yaml @@ -5,8 +5,8 @@ defaults: # Raw data # Where the code metadata is stored -input_code_metadata_fp: ${output_cohort_dir}/metadata/codes.parquet -input_dir: 
${output_cohort_dir}/data -output_dir: ${output_cohort_dir}/tabularize +input_code_metadata_fp: ${output_dir}/metadata/codes.parquet +input_dir: ${input_dir} +output_tabularized_dir: ${output_dir}/tabularize name: tabularization diff --git a/src/MEDS_tabular_automl/configs/tabularization/default.yaml b/src/MEDS_tabular_automl/configs/tabularization/default.yaml index ada7dc9..8c51383 100644 --- a/src/MEDS_tabular_automl/configs/tabularization/default.yaml +++ b/src/MEDS_tabular_automl/configs/tabularization/default.yaml @@ -1,5 +1,5 @@ # User inputs -filtered_code_metadata_fp: ${output_cohort_dir}/metadata/codes.parquet +filtered_code_metadata_fp: ${output_dir}/metadata/codes.parquet allowed_codes: null min_code_inclusion_count: 10 min_code_inclusion_frequency: null diff --git a/src/MEDS_tabular_automl/configs/task_specific_caching.yaml b/src/MEDS_tabular_automl/configs/task_specific_caching.yaml index 63fed0f..a372134 100644 --- a/src/MEDS_tabular_automl/configs/task_specific_caching.yaml +++ b/src/MEDS_tabular_automl/configs/task_specific_caching.yaml @@ -2,15 +2,16 @@ defaults: - default - tabularization: default - _self_ -task_name: task +task_name: ??? -# Tabularized Data -input_dir: ${output_cohort_dir}/tabularize +# Directory of tabularized data +input_tabularized_dir: ${output_dir}/tabularize # Where the labels are stored, with columns subject_id, timestamp, label -input_label_dir: ${MEDS_cohort_dir}/tasks/${task_name}/ -# Where to output the task specific tabularized data -output_dir: ${output_cohort_dir}/${task_name}/task_cache -output_label_dir: ${output_cohort_dir}/${task_name}/labels +input_label_dir: ??? +# Where to output the task, split, and shard specific tabularized data +output_tabularized_cache_dir: ${output_dir}/${task_name}/task_cache +# Where to output the task, split, and shard specific label data +output_label_cache_dir: ${output_dir}/${task_name}/labels label_column: "boolean_value" diff --git a/src/MEDS_tabular_automl/describe_codes.py b/src/MEDS_tabular_automl/describe_codes.py index 70c53bd..23cdb98 100644 --- a/src/MEDS_tabular_automl/describe_codes.py +++ b/src/MEDS_tabular_automl/describe_codes.py @@ -2,7 +2,7 @@ import polars as pl -from MEDS_tabular_automl.utils import DF_T, get_feature_names +from MEDS_tabular_automl.utils import get_feature_names def convert_to_df(freq_dict: dict[str, int]) -> pl.DataFrame: @@ -65,7 +65,7 @@ def convert_to_freq_dict(df: pl.LazyFrame) -> dict[str, dict[int, int]]: return dict(df.collect().iter_rows()) -def compute_feature_frequencies(shard_df: DF_T) -> pl.DataFrame: +def compute_feature_frequencies(shard_df: pl.LazyFrame) -> pl.DataFrame: """Generates a DataFrame containing the frequencies of codes and numerical values under different aggregations by computing frequency counts for certain attributes and organizing the results into specific categories based on the dataset's features. diff --git a/src/MEDS_tabular_automl/file_name.py b/src/MEDS_tabular_automl/file_name.py index 4ca144a..36b2861 100644 --- a/src/MEDS_tabular_automl/file_name.py +++ b/src/MEDS_tabular_automl/file_name.py @@ -73,7 +73,7 @@ def get_model_files(cfg: DictConfig, split: str, shard: str) -> list[Path]: Examples: >>> cfg = DictConfig({ - ... "input_dir": "data", + ... "path": DictConfig({"input_tabularized_cache_dir" : "data"}), ... "tabularization": { ... "window_sizes": ["1d", "7d"], ... 
"aggs": ["code/count", "value/sum", "static/present"], @@ -94,7 +94,7 @@ def get_model_files(cfg: DictConfig, split: str, shard: str) -> list[Path]: """ window_sizes = cfg.tabularization.window_sizes aggs = cfg.tabularization.aggs - shard_dir = Path(cfg.input_dir) / split / shard + shard_dir = Path(cfg.path.input_tabularized_cache_dir) / split / shard # Given a shard number, returns the model files model_files = [] for window_size in window_sizes: diff --git a/src/MEDS_tabular_automl/scripts/cache_task.py b/src/MEDS_tabular_automl/scripts/cache_task.py index 0903aa3..a437119 100644 --- a/src/MEDS_tabular_automl/scripts/cache_task.py +++ b/src/MEDS_tabular_automl/scripts/cache_task.py @@ -25,6 +25,7 @@ hydra_loguru_init, load_matrix, load_tqdm, + stage_init, write_df, ) @@ -79,17 +80,36 @@ def main(cfg: DictConfig): Args: cfg: The configuration for processing, loaded from a YAML file. """ + stage_init( + cfg, + [ + "input_dir", + "input_label_dir", + "input_tabularized_dir", + "output_dir", + "tabularization.filtered_code_metadata_fp", + ], + ) iter_wrapper = load_tqdm(cfg.tqdm) if not cfg.loguru_init: hydra_loguru_init() # Produce ts representation # shuffle tasks - tabularization_tasks = list_subdir_files(cfg.input_dir, "npz") + tabularization_tasks = list_subdir_files(cfg.input_tabularized_dir, "npz") + if len(tabularization_tasks) == 0: + raise FileNotFoundError( + f"No tabularized data found, `input_tabularized_dir`: {cfg.input_tabularized_dir}, " + "is likely incorrect" + ) np.random.shuffle(tabularization_tasks) label_dir = Path(cfg.input_label_dir) + if not label_dir.exists(): + raise FileNotFoundError( + f"Label directory {label_dir} does not exist, please check the `input_label_dir` kwarg" + ) label_df = ( pl.scan_parquet(label_dir / "**/*.parquet") .rename( @@ -108,9 +128,11 @@ def main(cfg: DictConfig): for data_fp in iter_wrapper(tabularization_tasks): # parse as time series agg split, shard_num, window_size, code_type, agg_name = Path(data_fp).with_suffix("").parts[-5:] - meds_data_in_fp = Path(cfg.output_cohort_dir) / "data" / split / f"{shard_num}.parquet" - shard_label_fp = Path(cfg.output_label_dir) / split / f"{shard_num}.parquet" - out_fp = (Path(cfg.output_dir) / get_shard_prefix(cfg.input_dir, data_fp)).with_suffix(".npz") + meds_data_in_fp = Path(cfg.input_dir) / split / f"{shard_num}.parquet" + shard_label_fp = Path(cfg.output_label_cache_dir) / split / f"{shard_num}.parquet" + out_fp = ( + Path(cfg.output_tabularized_cache_dir) / get_shard_prefix(cfg.input_tabularized_dir, data_fp) + ).with_suffix(".npz") def read_meds_data_df(meds_data_fp): if "numeric_value" not in pl.scan_parquet(meds_data_fp).columns: diff --git a/src/MEDS_tabular_automl/scripts/describe_codes.py b/src/MEDS_tabular_automl/scripts/describe_codes.py index c29b542..4391ccf 100644 --- a/src/MEDS_tabular_automl/scripts/describe_codes.py +++ b/src/MEDS_tabular_automl/scripts/describe_codes.py @@ -17,7 +17,7 @@ ) from ..file_name import list_subdir_files from ..mapper import wrap as rwlock_wrap -from ..utils import get_shard_prefix, hydra_loguru_init, load_tqdm, write_df +from ..utils import get_shard_prefix, hydra_loguru_init, load_tqdm, stage_init, write_df config_yaml = files("MEDS_tabular_automl").joinpath("configs/describe_codes.yaml") if not config_yaml.is_file(): @@ -32,6 +32,7 @@ def main(cfg: DictConfig): cfg: The configuration object for the tabularization process, loaded from a Hydra YAML configuration file. 
""" + stage_init(cfg, ["input_dir"]) iter_wrapper = load_tqdm(cfg.tqdm) if not cfg.loguru_init: hydra_loguru_init() diff --git a/src/MEDS_tabular_automl/scripts/launch_autogluon.py b/src/MEDS_tabular_automl/scripts/launch_autogluon.py index db61e9f..ccbc93d 100644 --- a/src/MEDS_tabular_automl/scripts/launch_autogluon.py +++ b/src/MEDS_tabular_automl/scripts/launch_autogluon.py @@ -1,20 +1,33 @@ +import json from importlib.resources import files from pathlib import Path import hydra import pandas as pd from loguru import logger -from omegaconf import DictConfig +from omegaconf import DictConfig, OmegaConf + +try: + import autogluon.tabular as ag +except ImportError: + ag = None from MEDS_tabular_automl.dense_iterator import DenseIterator -from ..utils import hydra_loguru_init +from ..utils import hydra_loguru_init, stage_init -config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_autogluon.yaml") +config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_model.yaml") if not config_yaml.is_file(): raise FileNotFoundError("Core configuration not successfully installed!") +def check_autogluon(): + if ag is None: + raise ImportError( + "AutoGluon could not be imported. Please try installing it using: `pip install autogluon`" + ) + + @hydra.main(version_base=None, config_path=str(config_yaml.parent.resolve()), config_name=config_yaml.stem) def main(cfg: DictConfig) -> float: """Launches AutoGluon after collecting data based on the provided configuration. @@ -22,17 +35,13 @@ def main(cfg: DictConfig) -> float: Args: cfg: The configuration dictionary specifying model and training parameters. """ - - # print(OmegaConf.to_yaml(cfg)) + check_autogluon() + stage_init( + cfg, ["input_dir", "input_label_cache_dir", "output_dir", "tabularization.filtered_code_metadata_fp"] + ) if not cfg.loguru_init: hydra_loguru_init() - # check that autogluon is installed - try: - import autogluon.tabular as ag - except ImportError: - logger.error("AutoGluon is not installed. 
Please install AutoGluon.") - # collect data based on the configuration itrain = DenseIterator(cfg, "train") ituning = DenseIterator(cfg, "tuning") @@ -44,13 +53,13 @@ def main(cfg: DictConfig) -> float: held_out_data, held_out_labels = iheld_out.densify() # construct dfs for AutoGluon - train_df = pd.DataFrame(train_data.todense()) # , columns=cols) + train_df = pd.DataFrame(train_data.todense()) train_df[cfg.task_name] = train_labels tuning_df = pd.DataFrame( tuning_data.todense(), - ) # columns=cols) + ) tuning_df[cfg.task_name] = tuning_labels - held_out_df = pd.DataFrame(held_out_data.todense()) # , columns=cols) + held_out_df = pd.DataFrame(held_out_data.todense()) held_out_df[cfg.task_name] = held_out_labels train_dataset = ag.TabularDataset(train_df) @@ -58,8 +67,13 @@ def main(cfg: DictConfig) -> float: held_out_dataset = ag.TabularDataset(held_out_df) # train model with AutoGluon + log_filepath = Path(cfg.path.model_log_dir) / f"{cfg.path.config_log_stem}_log.txt" + predictor = ag.TabularPredictor( - label=cfg.task_name, log_to_file=True, log_file_path=cfg.log_filepath, path=cfg.output_filepath + label=cfg.task_name, + log_to_file=True, + log_file_path=str(log_filepath.resolve()), + path=cfg.output_model_dir, ).fit(train_data=train_dataset, tuning_data=tuning_dataset) # predict @@ -69,12 +83,17 @@ def main(cfg: DictConfig) -> float: score = predictor.evaluate(held_out_dataset) logger.info("Test score:", score) - log_fp = Path(cfg.model_log_dir) - log_fp.mkdir(parents=True, exist_ok=True) - # log hyperparameters - out_fp = log_fp / "trial_performance_results.log" - with open(out_fp, "w") as f: - f.write(f"{cfg.output_filepath}\t{cfg.tabularization}\t{cfg.model_params}\t{None}\t{score}\n") + model_performance_log_filepath = Path(cfg.path.model_log_dir) / f"{cfg.path.performance_log_stem}.json" + model_performance_log_filepath.parent.mkdir(parents=True, exist_ok=True) + # store results + performance_dict = { + "output_model_dir": cfg.path.output_model_dir, + "tabularization": OmegaConf.to_container(cfg.tabularization), + "model_launcher": OmegaConf.to_container(cfg.model_launcher), + "score": score, + } + with open(model_performance_log_filepath, "w") as f: + json.dump(performance_dict, f) if __name__ == "__main__": diff --git a/src/MEDS_tabular_automl/scripts/launch_model.py b/src/MEDS_tabular_automl/scripts/launch_model.py index 97943b4..9f8b8da 100644 --- a/src/MEDS_tabular_automl/scripts/launch_model.py +++ b/src/MEDS_tabular_automl/scripts/launch_model.py @@ -3,11 +3,11 @@ from pathlib import Path import hydra -from omegaconf import DictConfig, open_dict +from omegaconf import DictConfig from MEDS_tabular_automl.base_model import BaseModel -from ..utils import hydra_loguru_init, log_to_logfile +from ..utils import hydra_loguru_init, log_to_logfile, stage_init config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_model.yaml") if not config_yaml.is_file(): @@ -24,32 +24,29 @@ def main(cfg: DictConfig) -> float: Returns: The evaluation result as the ROC AUC score on the held-out test set. 
""" + stage_init( + cfg, ["input_dir", "input_label_cache_dir", "output_dir", "tabularization.filtered_code_metadata_fp"] + ) - # print(OmegaConf.to_yaml(cfg)) if not cfg.loguru_init: hydra_loguru_init() - model: BaseModel = hydra.utils.instantiate(cfg.model_target) - # TODO - make tabularuzation be copied in the yaml instead of here - with open_dict(cfg): - model.cfg.tabularization = hydra.utils.instantiate(cfg.tabularization) + model_launcher: BaseModel = hydra.utils.instantiate(cfg.model_launcher) - model.train() - auc = model.evaluate() - # logger.info(f"AUC: {auc}") + model_launcher.train() + auc = model_launcher.evaluate() # save model - output_fp = Path(cfg.model_saving.model_dir) - output_fp = ( - output_fp.parent - / f"{cfg.model_saving.model_file_stem}_{auc:.4f}_{time.time()}{cfg.model_saving.model_file_extension}" - ) - output_fp.parent.mkdir(parents=True, exist_ok=True) + output_model_dir = Path(cfg.output_model_dir) + path_cfg = model_launcher.cfg.path + model_filename = f"{path_cfg.model_file_stem}_{auc:.4f}_{time.time()}{path_cfg.model_file_extension}" + output_fp = output_model_dir / model_filename + output_model_dir.parent.mkdir(parents=True, exist_ok=True) # log to logfile - log_to_logfile(model, cfg, output_fp.stem) + log_to_logfile(model_launcher, cfg, output_fp.stem) - model.save_model(output_fp) + model_launcher.save_model(output_fp) return auc diff --git a/src/MEDS_tabular_automl/scripts/tabularize_static.py b/src/MEDS_tabular_automl/scripts/tabularize_static.py index 34d9c0d..2baf406 100644 --- a/src/MEDS_tabular_automl/scripts/tabularize_static.py +++ b/src/MEDS_tabular_automl/scripts/tabularize_static.py @@ -30,6 +30,7 @@ get_shard_prefix, hydra_loguru_init, load_tqdm, + stage_init, write_df, ) @@ -56,7 +57,7 @@ def main( Args: cfg: - MEDS_cohort_dir: directory of MEDS format dataset that is ingested. + input_dir: directory of MEDS format dataset that is ingested. tabularized_data_dir: output directory of tabularized data. min_code_inclusion_frequency: The base feature inclusion frequency that should be used to dictate what features can be included in the flat representation. It can either be a float, in which @@ -81,6 +82,14 @@ def main( .. _link: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.groupby_rolling.html # noqa: E501 """ + stage_init( + cfg, + [ + "input_code_metadata_fp", + "input_dir", + "tabularization.filtered_code_metadata_fp", + ], + ) iter_wrapper = load_tqdm(cfg.tqdm) if not cfg.loguru_init: hydra_loguru_init() @@ -130,7 +139,7 @@ def write_fn(data, out_fp): np.random.shuffle(tabularization_tasks) for shard_fp, agg in iter_wrapper(tabularization_tasks): out_fp = ( - Path(cfg.output_dir) / get_shard_prefix(cfg.input_dir, shard_fp) / "none" / agg + Path(cfg.output_tabularized_dir) / get_shard_prefix(cfg.input_dir, shard_fp) / "none" / agg ).with_suffix(".npz") if out_fp.exists() and not cfg.do_overwrite: raise FileExistsError(f"do_overwrite is {cfg.do_overwrite} and {out_fp} exists!") diff --git a/src/MEDS_tabular_automl/scripts/tabularize_time_series.py b/src/MEDS_tabular_automl/scripts/tabularize_time_series.py index 8eca7f4..98cca20 100644 --- a/src/MEDS_tabular_automl/scripts/tabularize_time_series.py +++ b/src/MEDS_tabular_automl/scripts/tabularize_time_series.py @@ -26,6 +26,7 @@ get_shard_prefix, hydra_loguru_init, load_tqdm, + stage_init, write_df, ) @@ -64,6 +65,14 @@ def main( FileNotFoundError: If specified directories or files in the configuration are not found. 
ValueError: If required columns like 'code' or 'value' are missing in the data files. """ + stage_init( + cfg, + [ + "input_code_metadata_fp", + "input_dir", + "tabularization.filtered_code_metadata_fp", + ], + ) iter_wrapper = load_tqdm(cfg.tqdm) if not cfg.loguru_init: hydra_loguru_init() @@ -83,7 +92,7 @@ def main( # iterate through them for shard_fp, window_size, agg in iter_wrapper(tabularization_tasks): out_fp = ( - Path(cfg.output_dir) / get_shard_prefix(cfg.input_dir, shard_fp) / window_size / agg + Path(cfg.output_tabularized_dir) / get_shard_prefix(cfg.input_dir, shard_fp) / window_size / agg ).with_suffix(".npz") def read_fn(in_fp): diff --git a/src/MEDS_tabular_automl/sklearn_model.py b/src/MEDS_tabular_automl/sklearn_model.py index 56063fd..12ed980 100644 --- a/src/MEDS_tabular_automl/sklearn_model.py +++ b/src/MEDS_tabular_automl/sklearn_model.py @@ -1,4 +1,5 @@ from pathlib import Path +from pickle import dump import numpy as np import scipy.sparse as sp @@ -101,7 +102,7 @@ def __init__(self, cfg: DictConfig): """ super().__init__() self.cfg = cfg - self.keep_data_in_memory = cfg.model_params.iterator.keep_data_in_memory + self.keep_data_in_memory = cfg.data_loading_params.keep_data_in_memory self.itrain = None self.ituning = None @@ -111,7 +112,7 @@ def __init__(self, cfg: DictConfig): self.dtuning = None self.dheld_out = None - self.model = cfg.model_params.model + self.model = cfg.model # check that self.model is a valid model if not hasattr(self.model, "fit"): raise ValueError("Model does not have a fit method.") @@ -133,7 +134,7 @@ def _fit_from_partial(self): classes = self.itrain.get_classes() best_auc = 0 best_epoch = 0 - for epoch in range(self.cfg.model_params.epochs): + for epoch in range(self.cfg.training_params.epochs): # train on each all data for shard_idx in range(len(self.itrain._data_shards)): data, labels = self.itrain.get_data_shards(shard_idx) @@ -144,7 +145,7 @@ def _fit_from_partial(self): if auc > best_auc: best_auc = auc best_epoch = epoch - if epoch - best_epoch > self.cfg.model_params.early_stopping_rounds: + if epoch - best_epoch > self.cfg.training_params.early_stopping_rounds: break def _train(self): @@ -224,9 +225,9 @@ def save_model(self, output_fp: str): if not hasattr(self.model, "save_model"): logger.info(f"Model {self.model.__class__.__name__} does not have a save_model method.") logger.info("Model will be saved using pickle dump.") - from pickle import dump - - with open(output_fp.parent / "model.pkl", "wb") as f: + if not str(output_fp.resolve()).endswith(".pkl"): + raise ValueError("Model file extension must be .pkl.") + with open(output_fp, "wb") as f: dump(self.model, f, protocol=5) else: self.model.save_model(output_fp) diff --git a/src/MEDS_tabular_automl/tabular_dataset.py b/src/MEDS_tabular_automl/tabular_dataset.py index 84a6609..fa402b7 100644 --- a/src/MEDS_tabular_automl/tabular_dataset.py +++ b/src/MEDS_tabular_automl/tabular_dataset.py @@ -47,13 +47,21 @@ def __init__(self, cfg: DictConfig, split: str = "train"): split: The data split to use, which can be one of "train", "tuning", or "held_out". This determines which subset of the data is loaded and processed. 
""" - super().__init__(cache_prefix=Path(cfg.cache_dir)) + super().__init__(cache_prefix=Path(cfg.path.cache_dir)) self.cfg = cfg self.split = split # Load shards for this split self._data_shards = sorted( - [shard.stem for shard in list_subdir_files(Path(cfg.input_label_dir) / split, "parquet")] + [ + shard.stem + for shard in list_subdir_files(Path(cfg.path.input_label_cache_dir) / split, "parquet") + ] ) + if len(self._data_shards) == 0: + raise ValueError( + "No labels found in the `input_label_cache_dir` " + + str(Path(cfg.path.input_label_cache_dir).resolve()) + ) self.valid_event_ids, self.labels = None, None self.codes_set, self.code_masks, self.num_features = self._get_code_set() @@ -114,7 +122,7 @@ def _load_ids_and_labels( to lists of corresponding labels. """ label_fps = { - shard: (Path(self.cfg.input_label_dir) / self.split / shard).with_suffix(".parquet") + shard: (Path(self.cfg.path.input_label_cache_dir) / self.split / shard).with_suffix(".parquet") for shard in self._data_shards for shard in self._data_shards } @@ -126,7 +134,7 @@ def _load_ids_and_labels( if load_labels: cached_labels[shard] = label_df.select(pl.col("label")).collect().to_series() - if self.cfg.model_params.iterator.binarize_task: + if self.cfg.data_loading_params.binarize_task: cached_labels[shard] = cached_labels[shard].map_elements( lambda x: 1 if x > 0 else 0, return_dtype=pl.Int8 ) @@ -221,10 +229,10 @@ def _get_approximate_correlation_per_feature(self, X: sp.csc_matrix, y: np.ndarr def _set_imputer(self): """Sets the imputer for the data.""" if ( - hasattr(self.cfg.model_params.iterator, "imputer") - and self.cfg.model_params.iterator.imputer.imputer_target + hasattr(self.cfg.data_loading_params, "imputer") + and self.cfg.data_loading_params.imputer.imputer_target ): - imputer = self.cfg.model_params.iterator.imputer.imputer_target + imputer = self.cfg.data_loading_params.imputer.imputer_target if hasattr(imputer, "partial_fit"): for i in range(len(self._data_shards)): X, _ = self._get_shard_by_index(i) @@ -240,10 +248,10 @@ def _set_imputer(self): def _set_scaler(self): """Sets the scaler for the data.""" if ( - hasattr(self.cfg.model_params.iterator, "normalization") - and self.cfg.model_params.iterator.normalization.normalizer + hasattr(self.cfg.data_loading_params, "normalization") + and self.cfg.data_loading_params.normalization.normalizer ): - scaler = self.cfg.model_params.iterator.normalization.normalizer + scaler = self.cfg.data_loading_params.normalization.normalizer if hasattr(scaler, "partial_fit"): for i in range(len(self._data_shards)): X, _ = self._get_shard_by_index(i) diff --git a/src/MEDS_tabular_automl/utils.py b/src/MEDS_tabular_automl/utils.py index 3d6f496..97a74f9 100644 --- a/src/MEDS_tabular_automl/utils.py +++ b/src/MEDS_tabular_automl/utils.py @@ -1,21 +1,15 @@ -"""The base class for core dataset processing logic. - -Attributes: - INPUT_DF_T: This defines the type of the allowable input dataframes -- e.g., databases, filepaths, - dataframes, etc. - DF_T: This defines the type of internal dataframes -- e.g. polars DataFrames. 
-""" +"""The base class for core dataset processing logic and script utilities.""" import os +import sys from pathlib import Path import hydra import numpy as np import polars as pl from loguru import logger -from omegaconf import OmegaConf +from omegaconf import DictConfig, ListConfig, OmegaConf from scipy.sparse import coo_array -DF_T = pl.LazyFrame WRITE_USE_PYARROW = True ROW_IDX_NAME = "__row_idx" @@ -51,7 +45,7 @@ def filter_to_codes( min_code_inclusion_count: int | None, min_code_inclusion_frequency: float | None, max_include_codes: int | None, -) -> list[str]: +) -> ListConfig[str]: """Filters and returns codes based on allowed list and minimum frequency. Args: @@ -69,8 +63,14 @@ def filter_to_codes( ... pl.DataFrame({"code": ["E", "D", "A"], "count": [4, 3, 2]}).write_parquet(f.name) ... filter_to_codes( f.name, ["A", "D"], 3, None, None) ['D'] + >>> with NamedTemporaryFile() as f: + ... pl.DataFrame({"code": ["E", "D", "A"], "count": [4, 3, 2]}).write_parquet(f.name) + ... filter_to_codes( f.name, ["A", "D"], 10, None, None) + Traceback (most recent call last): + ... + ValueError: Code filtering criteria ... + ... """ - feature_freqs = pl.read_parquet(code_metadata_fp) if allowed_codes is not None: @@ -78,18 +78,23 @@ def filter_to_codes( if min_code_inclusion_frequency is not None: pass - # need to consider size of the dataset vs count - - # feature_freqs = feature_freqs.filter(pl.col("frequency") >= min_code_inclusion_frequency) if min_code_inclusion_count is not None: feature_freqs = feature_freqs.filter(pl.col("count") >= min_code_inclusion_count) if max_include_codes is not None: - # feature_freqs = feature_freqs.sort("count", reverse=True).head(max_include_codes) feature_freqs = feature_freqs.sort("count", descending=True).head(max_include_codes) - return sorted(feature_freqs["code"].to_list()) + if len(feature_freqs["code"]) == 0: + raise ValueError( + f"Code filtering criteria leaves only 0 codes. Note that {feature_freqs.shape[0]} " + "codes are read in, try modifying the following kwargs:" + f"\n- tabularization.allowed_codes: {allowed_codes}" + f"\n- tabularization.min_code_inclusion_count: {min_code_inclusion_count}" + f"\n- tabularization.min_code_inclusion_frequency: {min_code_inclusion_frequency}" + f"\n- tabularization.max_include_codes: {max_include_codes}" + ) + return ListConfig(sorted(feature_freqs["code"].to_list())) OmegaConf.register_new_resolver("filter_to_codes", filter_to_codes, replace=True) @@ -405,20 +410,69 @@ def log_to_logfile(model, cfg, output_fp): cfg: The configuration dictionary. output_fp: The relative output file path. 
""" - log_fp = Path(cfg.model_logging.model_log_dir) + log_fp = Path(cfg.path.model_log_dir) # make a folder to log everything for this model out_fp = log_fp / output_fp out_fp.mkdir(parents=True, exist_ok=True) # config as a json - config_fp = out_fp / f"{cfg.model_logging.config_log_stem}.log" + config_fp = out_fp / f"{cfg.path.config_log_stem}.json" with open(config_fp, "w") as f: f.write(OmegaConf.to_yaml(cfg)) - model_performance_fp = out_fp / f"{cfg.model_logging.performance_log_stem}.log" + model_performance_fp = out_fp / f"{cfg.path.performance_log_stem}.log" with open(model_performance_fp, "w") as f: f.write("model_fp,tuning_auc,test_auc\n") f.write(f"{output_fp},{model.evaluate()},{model.evaluate(split='held_out')}\n") logger.debug(f"Model config and performance logged to {config_fp} and {model_performance_fp}") + + +def current_script_name() -> str: + """Returns the name of the module that called this function.""" + + main_module = sys.modules["__main__"] + main_func = getattr(main_module, "main", None) + if main_func and callable(main_func): + func_module = main_func.__module__ + if func_module == "__main__": + return Path(sys.argv[0]).stem + else: + return func_module.split(".")[-1] + + logger.warning("Can't find main function in __main__ module. Using sys.argv[0] as a fallback.") + return Path(sys.argv[0]).stem + + +def stage_init(cfg: DictConfig, keys: list[str]): + """Initializes the stage by logging the configuration and the stage-specific paths. + + Args: + cfg: The global configuration object, which should have a ``cfg.stage_cfg`` attribute containing the + stage specific configuration. + + Returns: The data input directory, stage output directory, and metadata input directory. + """ + logger.info( + f"Running {current_script_name()} with the following configuration:\n{OmegaConf.to_yaml(cfg)}" + ) + + chk_kwargs = {k: OmegaConf.select(cfg, k) for k in keys} + + def chk(x: Path | None) -> str: + if x is None: + return "❌" + return "✅" if x.exists() and str(x) != "" else "❌" + + paths_strs = [ + f" - {k}: {chk(Path(v) if v is not None else None)} " + f"{str(Path(v).resolve()) if v is not None else 'None'}" + for k, v in chk_kwargs.items() + ] + + logger_strs = [ + f"Stage config:\n{OmegaConf.to_yaml(cfg)}", + "Paths: (checkbox indicates if it exists)", + ] + logger.debug("\n".join(logger_strs + paths_strs)) diff --git a/src/MEDS_tabular_automl/xgboost_model.py b/src/MEDS_tabular_automl/xgboost_model.py index ec33c15..f95563f 100644 --- a/src/MEDS_tabular_automl/xgboost_model.py +++ b/src/MEDS_tabular_automl/xgboost_model.py @@ -43,7 +43,7 @@ def __init__(self, cfg: DictConfig, split: str): cfg: The configuration dictionary. split: The data split to use. 
""" - xgb.DataIter.__init__(self, cache_prefix=Path(cfg.cache_dir)) + xgb.DataIter.__init__(self, cache_prefix=Path(cfg.path.cache_dir)) TabularDataset.__init__(self, cfg=cfg, split=split) self.valid_event_ids, self.labels = self._load_ids_and_labels() # check if the labels are empty @@ -107,7 +107,7 @@ def __init__(self, cfg: DictConfig): """ super().__init__() self.cfg = cfg - self.keep_data_in_memory = cfg.model_params.iterator.keep_data_in_memory + self.keep_data_in_memory = cfg.data_loading_params.keep_data_in_memory self.itrain = None self.ituning = None @@ -131,11 +131,10 @@ def _build(self): def _train(self): """Trains the model.""" self.model = xgb.train( - OmegaConf.to_container(self.cfg.model_params.model), + OmegaConf.to_container(self.cfg.model), self.dtrain, - num_boost_round=self.cfg.model_params.num_boost_round, - early_stopping_rounds=self.cfg.model_params.early_stopping_rounds, - # nthreads=self.cfg.nthreads, + num_boost_round=self.cfg.training_params.num_boost_round, + early_stopping_rounds=self.cfg.training_params.early_stopping_rounds, evals=[(self.dtrain, "train"), (self.dtuning, "tuning")], verbose_eval=0, ) diff --git a/tests/test_configs.py b/tests/test_configs.py index 708d270..d0bbc45 100644 --- a/tests/test_configs.py +++ b/tests/test_configs.py @@ -5,17 +5,14 @@ import subprocess import hydra +import polars as pl import pytest from hydra import compose, initialize -from hydra.core.hydra_config import HydraConfig -from loguru import logger +from omegaconf import DictConfig, OmegaConf from MEDS_tabular_automl.sklearn_model import SklearnModel from MEDS_tabular_automl.xgboost_model import XGBoostModel -logger.disable("MEDS_tabular_automl") -from omegaconf import OmegaConf - def run_command(script: str, args: list[str], hydra_kwargs: dict[str, str], test_name: str): command_parts = [script] + args + [f"{k}={v}" for k, v in hydra_kwargs.items()] @@ -28,46 +25,91 @@ def run_command(script: str, args: list[str], hydra_kwargs: dict[str, str], test def make_config_mutable(cfg): - OmegaConf.set_readonly(cfg, False) - for key in cfg: - if isinstance(cfg[key], OmegaConf): - make_config_mutable(cfg[key]) + if OmegaConf.is_config(cfg): + OmegaConf.set_readonly(cfg, False) + for key in cfg.keys(): + print(key) + # try: + cfg[key] = make_config_mutable(cfg[key]) + # except: + # import pdb; pdb.set_trace() + return cfg + # elif isinstance(cfg, list): + # return [make_config_mutable(item) for item in cfg] + # elif isinstance(cfg, dict): + # return {key: make_config_mutable(value) for key, value in cfg.items()} + else: + return cfg @pytest.mark.parametrize( - "model", - ["xgboost", "sgd_classifier", "knn_classifier", "logistic_regression", "random_forest_classifier"], + "model_launcher_override", + [ + "xgboost", + "sgd_classifier", + "knn_classifier", + "logistic_regression", + "random_forest_classifier", + "autogluon", + ], ) @pytest.mark.parametrize("imputer", ["default", "mean_imputer", "mode_imputer", "median_imputer"]) @pytest.mark.parametrize("normalization", ["standard_scaler", "max_abs_scaler"]) -def test_model_config(model, imputer, normalization): - MEDS_cohort_dir = "blah" +def test_model_config(model_launcher_override, imputer, normalization, tmp_path): + input_dir = "/foo/" + code_metadata_fp = f"/{str(tmp_path)}/codes.parquet" + model_launcher_config_kwargs = { + "input_dir": input_dir, + "output_dir": "/bar/", + "output_model_dir": "/baz/", + "++tabularization.filtered_code_metadata_fp": code_metadata_fp, + "++tabularization.min_code_inclusion_count": "0", + 
"task_name": "foo_bar", + } + pl.DataFrame({"code": ["E", "D", "A"], "count": [4, 3, 2]}).write_parquet(code_metadata_fp) + + with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"): + overrides = [ + f"model_launcher={model_launcher_override}", + f"data_processing_params.imputer={imputer}", + f"data_processing_params.normalization={normalization}", + ] + [f"{k}={v}" for k, v in model_launcher_config_kwargs.items()] + cfg = compose(config_name="launch_model", overrides=overrides, return_hydra_config=True) + + model_launcher = hydra.utils.instantiate(cfg.model_launcher) + match model_launcher_override: + case "xgboost": + assert isinstance( + model_launcher, XGBoostModel + ), "model_launcher should be an instance of XGBoostModel" + case "autogluon": + assert isinstance( + model_launcher, DictConfig + ), "model_launcher should not be a DictConfig for autogluon" + case _: + assert isinstance( + model_launcher, SklearnModel + ), "model_launcher should be an instance of SklearnModel" + assert cfg.tabularization.window_sizes + + +def test_generate_subsets_configs(): + input_dir = "blah" + stderr, stdout_ws = run_command("generate-subsets", ["[30d]"], {}, "generate-subsets window_sizes") + stderr, stdout_agg = run_command("generate-subsets", ["[static/present]"], {}, "generate-subsets aggs") xgboost_config_kwargs = { - "MEDS_cohort_dir": MEDS_cohort_dir, - "output_cohort_dir": "blah", + "input_dir": input_dir, + "output_dir": "blah", "do_overwrite": False, "seed": 1, "hydra.verbose": True, "tqdm": False, "loguru_init": True, "tabularization.min_code_inclusion_count": 1, - "tabularization.window_sizes": "[30d,365d,full]", - "tabularization._resolved_codes": "[test,test2]", + "tabularization.window_sizes": f"{stdout_ws.strip()}", } - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml - overrides = [f"model={model}", f"imputer={imputer}", f"normalization={normalization}"] + [ - f"{k}={v}" for k, v in xgboost_config_kwargs.items() - ] - cfg = compose( - config_name="launch_model", overrides=overrides, return_hydra_config=True - ) # config.yaml - - HydraConfig().set_config(cfg) - # make_config_mutable(cfg) - expected_model_class = XGBoostModel if model == "xgboost" else SklearnModel - model = hydra.utils.instantiate(cfg.model_target) - assert isinstance(model, expected_model_class) - # assert cfg.tabularization.window_sizes + with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"): + overrides = [f"{k}={v}" for k, v in xgboost_config_kwargs.items()] + cfg = compose(config_name="launch_model", overrides=overrides) + assert cfg.tabularization.window_sizes diff --git a/tests/test_integration.py b/tests/test_integration.py index 055070c..d231be1 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -13,6 +13,7 @@ CODE_COLS, EXPECTED_STATIC_FILES, MEDS_OUTPUTS, + NUM_SHARDS, SPLITS_JSON, STATIC_FIRST_COLS, STATIC_PRESENT_COLS, @@ -43,12 +44,14 @@ def run_command(script: str, args: list[str], hydra_kwargs: dict[str, str], test def test_integration(tmp_path): # Step 0: Setup Environment - MEDS_cohort_dir = Path(tmp_path) / "MEDS_cohort_dir" - output_cohort_dir = Path(tmp_path) / "output_cohort_dir" + input_dir = Path(tmp_path) / "input_dir" + output_dir = Path(tmp_path) / "output_dir" + input_label_dir = Path(tmp_path) / "label_dir" + output_model_dir = Path(tmp_path) / "output_model_dir" shared_config = { - "MEDS_cohort_dir": str(MEDS_cohort_dir.resolve()), - 
"output_cohort_dir": str(output_cohort_dir.resolve()), + "input_dir": str(input_dir.resolve()), + "output_dir": str(output_dir.resolve()), "do_overwrite": False, "seed": 1, "hydra.verbose": True, @@ -58,35 +61,34 @@ def test_integration(tmp_path): describe_codes_config = {**shared_config} - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml + with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"): overrides = [f"{k}={v}" for k, v in describe_codes_config.items()] - cfg = compose(config_name="describe_codes", overrides=overrides) # config.yaml + cfg = compose(config_name="describe_codes", overrides=overrides) # Create the directories - (output_cohort_dir / "data").mkdir(parents=True, exist_ok=True) + (output_dir).mkdir(parents=True, exist_ok=True) # Store MEDS outputs all_data = [] for split, data in MEDS_OUTPUTS.items(): - file_path = output_cohort_dir / "data" / f"{split}.parquet" - file_path.parent.mkdir(exist_ok=True) + file_path = input_dir / f"{split}.parquet" + file_path.parent.mkdir(exist_ok=True, parents=True) df = pl.read_csv(StringIO(data)).with_columns(pl.col("time").str.to_datetime("%Y-%m-%dT%H:%M:%S%.f")) df.write_parquet(file_path) all_data.append(df) + assert file_path.exists() all_data = pl.concat(all_data, how="diagonal_relaxed").sort(by=["subject_id", "time"]) # Check the files are not empty meds_files = list_subdir_files(Path(cfg.input_dir), "parquet") assert ( - len(list_subdir_files(Path(cfg.input_dir).parent, "parquet")) == 4 + len(list_subdir_files(Path(cfg.input_dir), "parquet")) == 4 ), "MEDS train split Data Files Should be 4!" for f in meds_files: assert pl.read_parquet(f).shape[0] > 0, "MEDS Data Tabular Dataframe Should not be Empty!" 
split_json = json.load(StringIO(SPLITS_JSON)) - splits_fp = output_cohort_dir / ".shards.json" + splits_fp = input_dir / ".shards.json" json.dump(split_json, splits_fp.open("w")) # Step 1: Run the describe_codes script @@ -96,6 +98,7 @@ def test_integration(tmp_path): describe_codes_config, "describe_codes", ) + assert Path(cfg.output_filepath).is_file() feature_columns = get_feature_columns(cfg.output_filepath) @@ -106,7 +109,7 @@ def test_integration(tmp_path): assert get_feature_names(value_agg, feature_columns) == sorted(VALUE_COLS) # Step 2: Run the static data tabularization script - tabularize_config = { + tabularize_static_config = { **shared_config, "tabularization.min_code_inclusion_count": 1, "tabularization.window_sizes": "[30d,365d,full]", @@ -114,17 +117,17 @@ def test_integration(tmp_path): stderr, stdout = run_command( "meds-tab-tabularize-static", [], - tabularize_config, + tabularize_static_config, "tabularization", ) - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml - overrides = [f"{k}={v}" for k, v in tabularize_config.items()] - cfg = compose(config_name="tabularization", overrides=overrides) # config.yaml - - output_files = list(Path(cfg.output_dir).glob("**/static/**/*.npz")) - actual_files = [get_shard_prefix(Path(cfg.output_dir), each) + ".npz" for each in output_files] + with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"): + overrides = [f"{k}={v}" for k, v in tabularize_static_config.items()] + cfg = compose(config_name="tabularization", overrides=overrides) + + output_dir = Path(cfg.output_dir) / "tabularize" + + output_files = list(output_dir.glob("**/static/**/*.npz")) + actual_files = [get_shard_prefix(output_dir, each) + ".npz" for each in output_files] assert set(actual_files) == set(EXPECTED_STATIC_FILES) # Check the files are not empty for f in output_files: @@ -168,11 +171,9 @@ def test_integration(tmp_path): ) # confirm summary files exist: - output_files = list_subdir_files(cfg.output_dir, "npz") + output_files = list_subdir_files(str(output_dir.resolve()), "npz") actual_files = [ - get_shard_prefix(Path(cfg.output_dir), each) + ".npz" - for each in output_files - if "none/static" not in str(each) + get_shard_prefix(output_dir, each) + ".npz" for each in output_files if "none/static" not in str(each) ] assert len(actual_files) > 0 for f in output_files: @@ -194,18 +195,36 @@ def test_integration(tmp_path): assert ts_matrix.shape[0] == expected_num_rows, ( f"Time-Series Data matrix Should have {expected_num_rows}" f" rows but has {ts_matrix.shape[0]}!" 
) + output_files = list_subdir_files(str(Path(cfg.output_tabularized_dir).resolve()), "npz") + for split in split_json.keys(): + for window in cfg.tabularization.window_sizes: + for agg in cfg.tabularization.aggs: + if agg.startswith("static"): + if window != cfg.tabularization.window_sizes[0]: + continue + expected_fp = Path(cfg.output_tabularized_dir) / split / "none" / f"{agg}.npz" + else: + expected_fp = Path(cfg.output_tabularized_dir) / split / window / f"{agg}.npz" + assert expected_fp in output_files, f"Missing {expected_fp}" + expected_num_time_tabs = ( + NUM_SHARDS * len(cfg.tabularization.window_sizes) * (len(cfg.tabularization.aggs) - 2) + ) + expected_num_static_tabs = NUM_SHARDS * 2 + assert len(list_subdir_files(cfg.output_dir, "npz")) == expected_num_time_tabs + expected_num_static_tabs + # Step 4: Run the task_specific_caching script cache_config = { **shared_config, "tabularization.min_code_inclusion_count": 1, "tabularization.window_sizes": "[30d,365d,full]", + "task_name": "test_task", + "input_label_dir": str(input_label_dir.resolve()), } - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml + with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"): overrides = [f"{k}={v}" for k, v in cache_config.items()] - cfg = compose(config_name="task_specific_caching", overrides=overrides) # config.yaml + cfg = compose(config_name="task_specific_caching", overrides=overrides) + # Create fake labels df = get_unique_time_events_df(get_events_df(all_data.lazy(), feature_columns)).collect() pseudo_labels = pl.Series(([0, 1] * df.shape[0])[: df.shape[0]]) df = df.with_columns(pl.Series(name="boolean_value", values=pseudo_labels)) @@ -229,17 +248,54 @@ def test_integration(tmp_path): cache_config, "task_specific_caching", ) + for split in split_json.keys(): + for window in cfg.tabularization.window_sizes: + for agg in cfg.tabularization.aggs: + if agg.startswith("static"): + if window != cfg.tabularization.window_sizes[0]: + continue + expected_fp = Path(cfg.output_tabularized_cache_dir) / split / "none" / f"{agg}.npz" + else: + expected_fp = Path(cfg.output_tabularized_cache_dir) / split / window / f"{agg}.npz" + output_files = list_subdir_files(str(Path(cfg.output_tabularized_cache_dir).resolve()), "npz") + assert expected_fp in output_files, f"Missing {expected_fp}" + [each for each in output_files if "0/30d" in str(each) and "code/count" in str(each)] + assert ( + len(list_subdir_files(cfg.output_tabularized_cache_dir, "npz")) + == expected_num_time_tabs + expected_num_static_tabs + ) + stderr, stdout = run_command( - "meds-tab-model", + "meds-tab-cache-task", [ "--multirun", - f"tabularization.window_sizes={stdout_ws.strip()}", f"tabularization.aggs={stdout_agg.strip()}", - "hydra.sweeper.n_jobs=5", - "hydra.sweeper.n_trials=10", ], cache_config, - "xgboost-model", + "task_specific_caching", ) - assert "The best model can be found at" in stderr - assert "Performance of best model:" in stderr + + for model in [ + "xgboost", + "knn_classifier", + "logistic_regression", + "random_forest_classifier", + "sgd_classifier", + ]: + model_config = { + **shared_config, + "tabularization.min_code_inclusion_count": 1, + "tabularization.window_sizes": "[30d,365d,full]", + "task_name": "test_task", + "output_model_dir": str(output_model_dir.resolve()), + "model_launcher": model, + "hydra.sweeper.n_trials": 1, + } + overrides = [f"tabularization.aggs={stdout_agg.strip()}"] + if model == "autogluon": + 
script = "meds-tab-autogluon" + else: + script = "meds-tab-model" + overrides = ["--multirun"] + overrides + + stderr, stdout = run_command(script, overrides, model_config, f"launch_model_{model}") diff --git a/tests/test_tabularize.py b/tests/test_tabularize.py index d110121..006252d 100644 --- a/tests/test_tabularize.py +++ b/tests/test_tabularize.py @@ -2,16 +2,14 @@ root = rootutils.setup_root(__file__, dotenv=True, pythonpath=True, cwd=True) -import importlib.util +import importlib import json -import os -import subprocess +import shutil from io import StringIO from pathlib import Path import polars as pl from hydra import compose, initialize -from hydra.core.hydra_config import HydraConfig from loguru import logger from MEDS_tabular_automl.describe_codes import get_feature_columns @@ -150,12 +148,14 @@ def test_tabularize(tmp_path): - MEDS_cohort_dir = Path(tmp_path) / "MEDS_cohort_dir" - output_cohort_dir = Path(tmp_path) / "output_cohort_dir" + input_dir = Path(tmp_path) / "input_dir" + output_dir = Path(tmp_path) / "output_dir" + input_label_dir = Path(tmp_path) / "label_dir" + output_model_dir = Path(tmp_path) / "output_model_dir" shared_config = { - "MEDS_cohort_dir": str(MEDS_cohort_dir.resolve()), - "output_cohort_dir": str(output_cohort_dir.resolve()), + "input_dir": str(input_dir.resolve()), + "output_dir": str(output_dir.resolve()), "do_overwrite": False, "seed": 1, "hydra.verbose": True, @@ -165,35 +165,34 @@ def test_tabularize(tmp_path): describe_codes_config = {**shared_config} - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml + with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"): overrides = [f"{k}={v}" for k, v in describe_codes_config.items()] - cfg = compose(config_name="describe_codes", overrides=overrides) # config.yaml + cfg = compose(config_name="describe_codes", overrides=overrides) # Create the directories - (output_cohort_dir / "data").mkdir(parents=True, exist_ok=True) + (output_dir).mkdir(parents=True, exist_ok=True) # Store MEDS outputs all_data = [] for split, data in MEDS_OUTPUTS.items(): - file_path = output_cohort_dir / "data" / f"{split}.parquet" - file_path.parent.mkdir(exist_ok=True) + file_path = input_dir / f"{split}.parquet" + file_path.parent.mkdir(exist_ok=True, parents=True) df = pl.read_csv(StringIO(data)).with_columns(pl.col("time").str.to_datetime("%Y-%m-%dT%H:%M:%S%.f")) df.write_parquet(file_path) all_data.append(df) + assert file_path.exists() all_data = pl.concat(all_data, how="diagonal_relaxed").sort(by=["subject_id", "time"]) # Check the files are not empty meds_files = list_subdir_files(Path(cfg.input_dir), "parquet") assert ( - len(list_subdir_files(Path(cfg.input_dir).parent, "parquet")) == 4 + len(list_subdir_files(Path(cfg.input_dir), "parquet")) == 4 ), "MEDS train split Data Files Should be 4!" for f in meds_files: assert pl.read_parquet(f).shape[0] > 0, "MEDS Data Tabular Dataframe Should not be Empty!" 
split_json = json.load(StringIO(SPLITS_JSON)) - splits_fp = output_cohort_dir / ".shards.json" + splits_fp = input_dir / ".shards.json" json.dump(split_json, splits_fp.open("w")) # Step 1: Describe Codes - compute code frequencies describe_codes.main(cfg) @@ -214,14 +213,12 @@ def test_tabularize(tmp_path): "tabularization.window_sizes": "[30d,365d,full]", } - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml + with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"): overrides = [f"{k}={v}" for k, v in tabularize_static_config.items()] - cfg = compose(config_name="tabularization", overrides=overrides) # config.yaml + cfg = compose(config_name="tabularization", overrides=overrides) tabularize_static.main(cfg) - output_dir = Path(cfg.output_cohort_dir) / "tabularize" + output_dir = Path(cfg.output_dir) / "tabularize" output_files = list(output_dir.glob("**/static/**/*.npz")) actual_files = [get_shard_prefix(output_dir, each) + ".npz" for each in output_files] @@ -280,35 +277,34 @@ def test_tabularize(tmp_path): assert ts_matrix.shape[0] == expected_num_rows, ( f"Time-Series Data matrix Should have {expected_num_rows}" f" rows but has {ts_matrix.shape[0]}!" ) - output_files = list_subdir_files(str(output_dir.resolve()), "npz") + output_files = list_subdir_files(str(Path(cfg.output_tabularized_dir).resolve()), "npz") for split in split_json.keys(): for window in cfg.tabularization.window_sizes: for agg in cfg.tabularization.aggs: if agg.startswith("static"): if window != cfg.tabularization.window_sizes[0]: continue - expected_fp = Path(cfg.output_dir) / split / "none" / f"{agg}.npz" + expected_fp = Path(cfg.output_tabularized_dir) / split / "none" / f"{agg}.npz" else: - expected_fp = Path(cfg.output_dir) / split / window / f"{agg}.npz" + expected_fp = Path(cfg.output_tabularized_dir) / split / window / f"{agg}.npz" assert expected_fp in output_files, f"Missing {expected_fp}" expected_num_time_tabs = ( NUM_SHARDS * len(cfg.tabularization.window_sizes) * (len(cfg.tabularization.aggs) - 2) ) expected_num_static_tabs = NUM_SHARDS * 2 assert len(list_subdir_files(cfg.output_dir, "npz")) == expected_num_time_tabs + expected_num_static_tabs - cfg.output_dir # Step 3: Cache Task data cache_config = { **shared_config, "tabularization.min_code_inclusion_count": 1, "tabularization.window_sizes": "[30d,365d,full]", + "task_name": "test_task", + "input_label_dir": str(input_label_dir.resolve()), } - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml + with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"): overrides = [f"{k}={v}" for k, v in cache_config.items()] - cfg = compose(config_name="task_specific_caching", overrides=overrides) # config.yaml + cfg = compose(config_name="task_specific_caching", overrides=overrides) # Create fake labels df = get_unique_time_events_df(get_events_df(all_data.lazy(), feature_columns)).collect() @@ -327,133 +323,102 @@ def test_tabularize(tmp_path): if agg.startswith("static"): if window != cfg.tabularization.window_sizes[0]: continue - expected_fp = Path(cfg.output_dir) / split / "none" / f"{agg}.npz" + expected_fp = Path(cfg.output_tabularized_cache_dir) / split / "none" / f"{agg}.npz" else: - expected_fp = Path(cfg.output_dir) / split / window / f"{agg}.npz" - output_files = list_subdir_files(str(Path(cfg.output_dir).resolve()), "npz") + expected_fp = 
Path(cfg.output_tabularized_cache_dir) / split / window / f"{agg}.npz" + output_files = list_subdir_files(str(Path(cfg.output_tabularized_cache_dir).resolve()), "npz") assert expected_fp in output_files, f"Missing {expected_fp}" [each for each in output_files if "0/30d" in str(each) and "code/count" in str(each)] - assert len(list_subdir_files(cfg.output_dir, "npz")) == expected_num_time_tabs + expected_num_static_tabs + assert ( + len(list_subdir_files(cfg.output_tabularized_cache_dir, "npz")) + == expected_num_time_tabs + expected_num_static_tabs + ) - xgboost_config_kwargs = { + xgboost_config = { **shared_config, "tabularization.min_code_inclusion_count": 1, "tabularization.window_sizes": "[30d,365d,full]", + "task_name": "test_task", + "output_model_dir": str(output_model_dir.resolve()), } - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml - overrides = ["model=xgboost"] + [f"{k}={v}" for k, v in xgboost_config_kwargs.items()] - cfg = compose( - config_name="launch_model", overrides=overrides, return_hydra_config=True - ) # config.yaml - - output_dir = Path(cfg.output_cohort_dir) / "model" + with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"): + overrides = ["model_launcher=xgboost"] + [f"{k}={v}" for k, v in xgboost_config.items()] + cfg = compose(config_name="launch_model", overrides=overrides, return_hydra_config=True) - HydraConfig().set_config(cfg) launch_model.main(cfg) - output_files = list(output_dir.glob("**/*.json")) - assert len(output_files) == 1 - log_dir = Path(cfg.model_logging.model_log_dir) - log_csv = list(log_dir.glob("**/*.log")) - assert len(log_csv) == 2 + expected_output_dir = Path(cfg.output_model_dir) + output_files = list(expected_output_dir.glob("**/*.json")) + assert len(output_files) == 2 + + log_dir = Path(cfg.path.model_log_dir) + log_files = list(log_dir.glob("**/*.log")) + assert len(log_files) == 1 + shutil.rmtree(expected_output_dir) - sklearnmodel_config_kwargs = { + sklearnmodel_config = { **shared_config, "tabularization.min_code_inclusion_count": 1, "tabularization.window_sizes": "[30d,365d,full]", + "task_name": "test_task", + "output_model_dir": str(output_model_dir.resolve()), } - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml - overrides = ["model=sgd_classifier"] + [f"{k}={v}" for k, v in sklearnmodel_config_kwargs.items()] - cfg = compose(config_name="launch_model", overrides=overrides) # config.yaml - - output_dir = Path(cfg.output_cohort_dir) / "model" + with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"): + overrides = ["model_launcher=sgd_classifier"] + [f"{k}={v}" for k, v in sklearnmodel_config.items()] + cfg = compose(config_name="launch_model", overrides=overrides) launch_model.main(cfg) - output_files = list(output_dir.glob("**/*.pkl")) + + expected_output_dir = Path(cfg.output_model_dir) + output_files = list(expected_output_dir.glob("**/*.pkl")) assert len(output_files) == 1 + shutil.rmtree(expected_output_dir) - sklearnmodel_config_kwargs = { + sklearnmodel_config = { **shared_config, "tabularization.min_code_inclusion_count": 1, "tabularization.window_sizes": "[30d,365d,full]", - "model_params.iterator.keep_data_in_memory": False, - "model_saving.model_dir": "${output_cohort_dir}/model_online/model_${now:%Y-%m-%d_%H-%M-%S}", + "data_loading_params.keep_data_in_memory": False, + "task_name": "test_task", + "output_model_dir": 
str(output_model_dir.resolve()), } - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml - overrides = ["model=sgd_classifier"] + [f"{k}={v}" for k, v in sklearnmodel_config_kwargs.items()] - cfg = compose(config_name="launch_model", overrides=overrides) # config.yaml + with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"): + overrides = ["model_launcher=sgd_classifier"] + [f"{k}={v}" for k, v in sklearnmodel_config.items()] + cfg = compose(config_name="launch_model", overrides=overrides) + assert not cfg.data_loading_params.keep_data_in_memory + assert cfg.model_launcher.data_loading_params.binarize_task - output_dir = Path(cfg.output_cohort_dir) / "model_online" + output_dir = Path(cfg.output_dir) / "model_online" launch_model.main(cfg) - output_files = list(output_dir.glob("**/*.pkl")) + + expected_output_dir = Path(cfg.output_model_dir) + output_files = list(expected_output_dir.glob("**/*.pkl")) assert len(output_files) == 1 + shutil.rmtree(expected_output_dir) if importlib.util.find_spec("autogluon") is not None: import autogluon as ag from MEDS_tabular_automl.scripts import launch_autogluon - autogluon_config_kwargs = { + autogluon_config = { **shared_config, "tabularization.min_code_inclusion_count": 1, "tabularization.window_sizes": "[30d,365d,full]", - "model_params.iterator.keep_data_in_memory": False, - "model_saving.model_dir": "${output_cohort_dir}/model_online/model_${now:%Y-%m-%d_%H-%M-%S}", + "task_name": "test_task", + "output_model_dir": str(output_model_dir.resolve()), } - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to config.yaml - overrides = [f"{k}={v}" for k, v in autogluon_config_kwargs.items()] - cfg = compose(config_name="launch_autogluon", overrides=overrides) # config.yaml - - output_dir = Path(cfg.output_cohort_dir) / "model_online" + with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"): + overrides = ["model_launcher=autogluon"] + [f"{k}={v}" for k, v in autogluon_config.items()] + cfg = compose(config_name="launch_model", overrides=overrides) launch_autogluon.main(cfg) - output_files = list(output_dir.glob("*")) - most_recent_file = max(output_files, key=os.path.getmtime) - ag.tabular.TabularPredictor.load(most_recent_file) - - -def run_command(script: str, args: list[str], hydra_kwargs: dict[str, str], test_name: str): - command_parts = [script] + args + [f"{k}={v}" for k, v in hydra_kwargs.items()] - command_out = subprocess.run(" ".join(command_parts), shell=True, capture_output=True) - stderr = command_out.stderr.decode() - stdout = command_out.stdout.decode() - if command_out.returncode != 0: - raise AssertionError(f"{test_name} failed!\nstdout:\n{stdout}\nstderr:\n{stderr}") - return stderr, stdout - - -def test_xgboost_config(): - MEDS_cohort_dir = "blah" - stderr, stdout_ws = run_command("generate-subsets", ["[30d]"], {}, "generate-subsets window_sizes") - stderr, stdout_agg = run_command("generate-subsets", ["[static/present]"], {}, "generate-subsets aggs") - xgboost_config_kwargs = { - "MEDS_cohort_dir": MEDS_cohort_dir, - "output_cohort_dir": "blah", - "do_overwrite": False, - "seed": 1, - "hydra.verbose": True, - "tqdm": False, - "loguru_init": True, - "tabularization.min_code_inclusion_count": 1, - "tabularization.window_sizes": f"{stdout_ws.strip()}", - } - with initialize( - version_base=None, config_path="../src/MEDS_tabular_automl/configs/" - ): # path to 
config.yaml - overrides = [f"{k}={v}" for k, v in xgboost_config_kwargs.items()] - cfg = compose(config_name="launch_model", overrides=overrides) # config.yaml - assert cfg.tabularization.window_sizes + expected_output_filepath = Path(cfg.output_model_dir) / "predictor.pkl" + assert expected_output_filepath.is_file() + ag.tabular.TabularPredictor.load(cfg.output_model_dir)
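
The updated tests above exercise the renamed config groups end to end. As a quick reference, the following is a minimal sketch (not part of the diff) of driving the reworked `launch_model` config programmatically with the new keys, mirroring `tests/test_tabularize.py`. The paths and task name are placeholders, and the relative `config_path` assumes the caller sits in the `tests/` directory as the updated tests do.

```python
# Minimal sketch with placeholder paths: compose the reworked `launch_model` config and run it.
from hydra import compose, initialize

from MEDS_tabular_automl.scripts import launch_model

with initialize(version_base=None, config_path="../src/MEDS_tabular_automl/configs/"):
    cfg = compose(
        config_name="launch_model",
        overrides=[
            "model_launcher=xgboost",            # previously `model=xgboost`
            "input_dir=/path/to/meds/data",      # previously `MEDS_cohort_dir`
            "output_dir=/path/to/tabularized",   # previously `output_cohort_dir`
            "output_model_dir=/path/to/models",  # new: models no longer land under the output cohort
            "task_name=my_task",
            "tabularization.min_code_inclusion_count=1",
            "tabularization.window_sizes=[30d,365d,full]",
        ],
        return_hydra_config=True,
    )

# Model, training, and iterator settings now live under `model_launcher`,
# `training_params`, and `data_loading_params` instead of `model_params.*`.
launch_model.main(cfg)
```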