Version 4.0.0 #186

Draft · wants to merge 142 commits into base: main

Commits (142)
5507b4b
Messing around with refactoring model exploration
ccdavis Nov 14, 2024
3b84f26
Fixed failures due to bad code
ccdavis Nov 15, 2024
62ff6e6
No errors, use model exploration approach that should get pr_auc mean…
ccdavis Nov 15, 2024
3477b71
remove cache() and typo
ccdavis Nov 15, 2024
c0397c5
Renaming for clarity
ccdavis Nov 16, 2024
1fe6224
wip
Nov 16, 2024
28c6cde
giving up for now
ccdavis Nov 16, 2024
1f70f66
wip
ccdavis Nov 18, 2024
8e5415f
refactoring
ccdavis Nov 19, 2024
941bd06
finished refactoring sketch
ccdavis Nov 19, 2024
1f2bd49
Fixed some typos
ccdavis Nov 19, 2024
21cac61
correctly save suspicious data
ccdavis Nov 19, 2024
c9576e8
Debugging _get_aggregates in test. It looks like the test data just d…
ccdavis Nov 20, 2024
319129f
Use all splits on thresholding
Nov 15, 2024
9a90143
Adjust test to account for results with only the best hyper parameter…
ccdavis Nov 21, 2024
a14ccdf
Clean up stdout and make a model-param selection report.
ccdavis Nov 21, 2024
2facf41
model exploration tests pass; need more
ccdavis Nov 21, 2024
3bbac41
Separate each fold test run output.
Nov 22, 2024
3b22f14
Clean up output
ccdavis Nov 25, 2024
efa67f7
Tests pass
ccdavis Nov 25, 2024
38c1006
fixed some tests, the FNS count test is broken because of the single …
ccdavis Nov 25, 2024
c5f5b13
[#167] Pull _custom_param_grid_builder() out of the LinkStepTrainTest…
riley-harper Nov 26, 2024
605369b
[#167] Simplify the interface to _custom_param_grid_builder()
riley-harper Nov 26, 2024
2204152
[#167] Pull _get_model_parameters() out of the LinkStep class
riley-harper Nov 26, 2024
7d48380
[#167] Add a few tests for _get_model_parameters()
riley-harper Nov 26, 2024
bc0bf7d
[#167] Just pass the training section of the config to _get_model_par…
riley-harper Nov 26, 2024
8be8806
[#167] Add a couple of tests for the new training.model_parameter_sea…
riley-harper Nov 26, 2024
a939ec2
[#167] Look for training.model_parameter_search in _get_model_paramet…
riley-harper Nov 26, 2024
801582e
[#167] Make sure that model_parameter_search takes precedence over pa…
riley-harper Nov 26, 2024
a94250c
wip
ccdavis Nov 27, 2024
667d322
Possibly working nested cv
Nov 22, 2024
a476884
[#167] Print a deprecation warning for training.param_grid
riley-harper Nov 27, 2024
8c72446
[#167] Refactor _get_model_parameters()
riley-harper Nov 27, 2024
896ad67
[#167] Improve an error condition in _get_model_parameters()
riley-harper Nov 27, 2024
46da4cb
[#167] Start supporting a randomized strategy which can randomly samp…
riley-harper Nov 27, 2024
51b4144
[#167] Support some simple distributions for randomized parameter search
riley-harper Nov 27, 2024
907818e
[#167] Use isinstance instead of directly checking types
riley-harper Nov 27, 2024
65cb5ff
[#167] Pull the edge case logic for "type" out of _choose_randomized_…
riley-harper Nov 27, 2024
1692c87
[#167] Support "pinned" parameters with model_parameter_search strate…
riley-harper Nov 27, 2024
f4a42f7
fix typo, testing
ccdavis Dec 2, 2024
0becd32
[#167] Respect training.seed when the search strategy is ""randomized"
riley-harper Dec 2, 2024
5d0ea0b
[#167] Add a normal distribution to randomized parameter search
riley-harper Dec 2, 2024
943fc0a
[#167] Improve the "unknown distribution" error message
riley-harper Dec 2, 2024
0f99e1b
[#167] Don't randomize threshold or threshold_ratio
riley-harper Dec 2, 2024
7fed016
[#167] Add a test for the unknown strategy error condition
riley-harper Dec 2, 2024
761e38f
reformatted
Dec 2, 2024
3e0cb90
better output for tracking progress of train-test
Dec 2, 2024
c7e7ba2
better messages
Dec 2, 2024
fdd402c
Better logging
ccdavis Dec 3, 2024
3500e7c
correctly group threshold metrics by outer fold iteration.
ccdavis Dec 3, 2024
1ea05d0
Try fewer shuffle partitions
ccdavis Dec 3, 2024
10ab7b4
set shuffle partitions back to 200
ccdavis Dec 3, 2024
47e28a6
Added nested-cv algo description in comments.
ccdavis Dec 3, 2024
b5e128f
Added seed on inner fold splitter; Update tests to at least pass.
ccdavis Dec 3, 2024
b123dbf
assert the logistic regression gives a decent result
ccdavis Dec 3, 2024
1ead1e7
Temporary commented out asserts due to different results presentation…
ccdavis Dec 3, 2024
45f3649
another test passes
ccdavis Dec 3, 2024
40f075d
all tests should pass
ccdavis Dec 3, 2024
0f5deb6
Merge branch 'main' into randomized_parameter_search
riley-harper Dec 3, 2024
b9c2123
fixed quote indent
ccdavis Dec 3, 2024
40f344e
Merge branch 'main' into refactor-nested-cross-validation
ccdavis Dec 3, 2024
c6d3a81
Merge branch 'main' into randomized_parameter_search
riley-harper Dec 3, 2024
1e55384
Address PR comments
ccdavis Dec 3, 2024
02d5f96
Merge branch 'main' into refactor-nested-cross-validation
ccdavis Dec 3, 2024
11bdfd4
Merge pull request #169 from ipums/refactor-nested-cross-validation
ccdavis Dec 4, 2024
73e6adc
Merge branch 'v4-dev' into randomized_parameter_search
riley-harper Dec 4, 2024
85802d3
Merge pull request #168 from ipums/randomized_parameter_search
riley-harper Dec 4, 2024
77a58c0
HH model exploration test passes; needed to adjust the expected colum…
ccdavis Dec 4, 2024
7e7baa0
Merge branch 'v4-dev' of github.com:ipums/hlink into v4-dev
ccdavis Dec 4, 2024
9542800
Merge branch 'main' into v4-dev
riley-harper Dec 4, 2024
e57dad6
[#172] Add type hints and docs to linking.core.classifier
riley-harper Dec 5, 2024
a736dd0
[#172] Don't handle threshold and threshold_ratio in choose_classifier()
riley-harper Dec 5, 2024
49bda13
[#174] Add type hints to linking.core.threshold
riley-harper Dec 5, 2024
28bcd03
[#174] Add a couple of unit tests for linking.core.threshold
riley-harper Dec 5, 2024
ad6ce10
[#174] Pass just decision into predict_with_thresholds() instead of t…
riley-harper Dec 5, 2024
5424513
[#174] Do some minor refactoring and cleanup of linking.core.threshold
riley-harper Dec 5, 2024
dd16360
[#174] Replace a SQL query with the equivalent spark expression
riley-harper Dec 5, 2024
647a751
[#174] Rewrite some thresholding code to use PySpark exprs instead of…
riley-harper Dec 5, 2024
b5c8ae9
[#174] Use withColumn() instead of select("*", ...)
riley-harper Dec 6, 2024
1ffb6d1
[#174] Improve the error message when there's no probability column
riley-harper Dec 6, 2024
d32c2bf
[#174] Update documentation and add a few logging debug statements
riley-harper Dec 6, 2024
3c9043c
Merge pull request #175 from ipums/core-arguments
riley-harper Dec 6, 2024
93a5c4e
WIP: refactor to combine threshold test results from all outer folds.…
ccdavis Dec 6, 2024
dd49937
WIP on correct metrics output; some tests break because of not enough…
ccdavis Dec 9, 2024
a041274
Cleaning up metrics
Dec 9, 2024
f083378
Tests pass
ccdavis Dec 10, 2024
1f162dc
Adjust hh model exploration test for new column names, no training co…
ccdavis Dec 10, 2024
bde173d
Merge pull request #177 from ipums/model-exploration-metrics
ccdavis Dec 10, 2024
b7f821c
[#176] Remove output_suspicious_TD and "suspicious traininig data" su…
riley-harper Dec 10, 2024
9755f73
[#176] Add a unit test for _get_confusion_matrix()
riley-harper Dec 10, 2024
c43b57d
[#176] Rewrite _get_confusion_matrix() to avoid using 4 filters + counts
riley-harper Dec 10, 2024
4aad62e
[#176] Add a unit test for _get_aggregate_metrics()
riley-harper Dec 10, 2024
3efbb0c
[#176] Lowercase tp/fp/fn/tn variable names
riley-harper Dec 10, 2024
627eed8
Try requiring scikit-learn<1.6 when xgboost is installed
riley-harper Dec 10, 2024
c1f0d8c
Merge pull request #178 from ipums/no-suspicious-data
riley-harper Dec 11, 2024
c166ace
[#179] Create a new core.model_metrics module and move _calc_mcc() there
riley-harper Dec 11, 2024
df9b463
[#179] Create precision() and recall() functions in core.model_metrics
riley-harper Dec 11, 2024
7817ed5
[#179] Factor away _get_aggregate_metrics()
riley-harper Dec 11, 2024
b93ab6f
[#179] Add hypothesis and some property tests for core.model_metrics
riley-harper Dec 11, 2024
8604767
[#179] Add a library function for F-measure, also known as F1-score
riley-harper Dec 11, 2024
75b4414
[#179] Unify variable and argument names
riley-harper Dec 11, 2024
ae59da3
[#179] Return math.nan from core.model_metrics
riley-harper Dec 11, 2024
fd40c35
[#179] Add .hypothesis/ to .gitignore
riley-harper Dec 11, 2024
1ecef81
[#179] Filter with math.isnan() instead of is not np.nan
riley-harper Dec 12, 2024
7f0c48c
[#179] Include F-measure in ThresholdTestResults
riley-harper Dec 12, 2024
a53c120
[#179] Put the raw confusion matrix counts in the ThresholdTestResults
riley-harper Dec 12, 2024
d87c5de
[#179] Simplify _aggregate_per_threshold_results()
riley-harper Dec 12, 2024
74a7dd9
[#179] Add F-measure to the output thresholded metrics data frame
riley-harper Dec 12, 2024
b454276
[#179] Return math.nan from core.model_metrics.mcc where it makes sense
riley-harper Dec 12, 2024
bd934f5
[#179] Don't automatically add or drop columns from thresholded metri…
riley-harper Dec 12, 2024
b2cf14c
[#179] Add documentation to core.model_metrics and refactor a bit
riley-harper Dec 13, 2024
7f8b49d
Merge pull request #180 from ipums/model_metrics
riley-harper Dec 13, 2024
4c6e602
[#181] Return a tuple (path, config) from load_conf_file
riley-harper Dec 13, 2024
46f79e3
[#181] Don't use load_conf() to set extra attributes on the configura…
riley-harper Dec 13, 2024
1f99c93
[#181] Remove the scripts.main.load_conf() function
riley-harper Dec 13, 2024
e0bf86e
[#181] Add a new checkpoint_dir argument to SparkConnection()
riley-harper Dec 13, 2024
3dbc75b
[#181] Implement checkpoint_dir behavior for SparkConnection and Spar…
riley-harper Dec 13, 2024
3f0d62f
Merge pull request #182 from ipums/checkpoint_directory_rework
riley-harper Dec 13, 2024
8bfe87e
Bump the version to 4.0.0a1
riley-harper Dec 13, 2024
7f802db
Run black
riley-harper Mar 5, 2025
0dd3d65
[#98] Remove hlink.linking.transformers.interaction_transformer
riley-harper Mar 5, 2025
305358a
[#127] Update test to avoid using blocking_steps
riley-harper Mar 5, 2025
3543afc
[#127] Remove support for "blocking_steps"
riley-harper Mar 5, 2025
08ac712
[#127] Inline matching._helpers.get_blocking()
riley-harper Mar 5, 2025
1a14cea
[#127] Remove support for old column_mappings format
riley-harper Mar 5, 2025
9c99a44
[#127] Remove support for deprecated form of mapping transforms
riley-harper Mar 5, 2025
727373f
[#127] Add tests for the "mapping" column mapping transform
riley-harper Mar 6, 2025
2004b2e
[#127] Update documentation for the mapping transform
riley-harper Mar 6, 2025
6769131
Merge pull request #184 from ipums/remove-deprecated
riley-harper Mar 6, 2025
7d44f8b
[#45] Use the tomli package instead of toml by default
riley-harper Mar 6, 2025
8518029
[#45] Add tests and docs for use_legacy_toml_parser
riley-harper Mar 6, 2025
d9d43cd
Merge pull request #185 from ipums/use_tomli
riley-harper Mar 6, 2025
94c7c8c
[#187] Fix a bug where model_metrics.mcc() < -1.0
riley-harper Mar 6, 2025
5152468
Merge pull request #188 from ipums/mcc-out-of-range
riley-harper Mar 6, 2025
4eda17d
[#183] Add a new model exploration docs page
riley-harper Mar 7, 2025
2a75c7d
[#183] Update the training and model exploration config docs
riley-harper Mar 7, 2025
d9ebc7d
[#183] Document the fine-grained details of model exploration
riley-harper Mar 7, 2025
b05ab22
Merge pull request #190 from ipums/model-exploration-docs
riley-harper Mar 7, 2025
8a86664
Merge branch 'main' into v4-dev
riley-harper Mar 7, 2025
be90274
[#183] Update docs for training.param_grid
riley-harper Mar 10, 2025
f6a4c47
Merge pull request #191 from ipums/param-grid-docs
riley-harper Mar 10, 2025
27a07c9
Bump the version to 4.0.0b1
riley-harper Mar 10, 2025
1 change: 1 addition & 0 deletions .gitignore
@@ -9,6 +9,7 @@ venv
sphinx-docs/_*
.coverage
coverage_*
.hypothesis/

# Scala
scala_jar/target
2 changes: 1 addition & 1 deletion docs/.buildinfo
@@ -1,4 +1,4 @@
# Sphinx build info version 1
# This file records the configuration used when building these files. When it is not found, a full rebuild will be done.
config: 3d084ea912736a6c4043e49bc2b58167
config: 51aa15e7a138f908be12c347931eec38
tags: 645f666f9bcd5a90fca523b33c5a78b7
4 changes: 2 additions & 2 deletions docs/.buildinfo.bak
@@ -1,4 +1,4 @@
# Sphinx build info version 1
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
config: 96d8a216541a8e03e59f47f661841dd9
# This file records the configuration used when building these files. When it is not found, a full rebuild will be done.
config: 346c22873853f51d4bd34095fc5e3354
tags: 645f666f9bcd5a90fca523b33c5a78b7
24 changes: 13 additions & 11 deletions docs/_sources/column_mappings.md.txt
@@ -288,25 +288,27 @@ transforms = [

### mapping

Map single or multiple values to a single output value, otherwise known as a "recoding."
Explicitly map from input values to output values. This is also known as a "recoding".
Input values which do not appear in the mapping are unchanged. By default, the output
column is of type string, but you can set `output_type = "int"` to cast the output
column to type integer instead.

Maps T → U.

```
```toml
[[column_mappings]]
column_name = "birthyr"
alias = "clean_birthyr"
transforms = [
{
type = "mapping",
values = [
{"from"=[9999,1999], "to" = ""},
{"from" = -9998, "to" = 9999}
]
}
]

[[column_mappings.transforms]]
type = "mapping"
mappings = {9999 = "", 1999 = "", "-9998" = "9999"}
output_type = "int"
```

*Changed in version 4.0.0: The deprecated `values` key is no longer supported.
Please use the `mappings` key documented above instead.*

### substring

Replace a column with a substring of the data in the column.
32 changes: 14 additions & 18 deletions docs/_sources/config.md.txt
@@ -13,8 +13,8 @@
12. [Household Comparisons](#household-comparisons)
13. [Comparison Features](#comparison-features)
14. [Pipeline-Generated Features](#pipeline-generated-features)
15. [Training and Models](#training-and-models)
16. [Household Training and Models](#household-training-and-models)
15. [Training and Model Exploration](#training-and-model-exploration)
16. [Household Training and Model Exploration](#household-training-and-model-exploration)

## Basic Config File

@@ -334,8 +334,7 @@ split_by_id_a = true
decision = "drop_duplicate_with_threshold_ratio"

n_training_iterations = 2
output_suspicious_TD = true
param_grid = true
model_parameter_search = {strategy = "grid"}
model_parameters = [
{ type = "random_forest", maxDepth = [7], numTrees = [100], threshold = [0.05, 0.005], threshold_ratio = [1.2, 1.3] },
{ type = "logistic_regression", threshold = [0.50, 0.65, 0.80], threshold_ratio = [1.0, 1.1] }
@@ -361,8 +360,7 @@ split_by_id_a = true
decision = "drop_duplicate_with_threshold_ratio"

n_training_iterations = 10
output_suspicious_TD = true
param_grid = false
model_parameter_search = {strategy = "explicit"}
model_parameters = [
{ type = "random_forest", maxDepth = 6, numTrees = 50, threshold = 0.5, threshold_ratio = 1.0 },
{ type = "probit", threshold = 0.5, threshold_ratio = 1.0 }
@@ -730,7 +728,7 @@ categorical = true
splits = [-1,0,6,11,9999]
```

## Training and [models](models)
## Training and [Model Exploration](model_exploration)

* Header name: `training`
* Description: Specifies the training data set as well as a myriad of attributes related to training a model including the dependent variable within that dataset, the independent variables created from the `comparison_features` section, and the different models you want to use for either model exploration or scoring.
@@ -740,21 +738,21 @@ splits = [-1,0,6,11,9999]
* `dataset` -- Type: `string`. Location of the training dataset. Must be a csv file.
* `dependent_var` -- Type: `string`. Name of dependent variable in training dataset.
* `independent_vars` -- Type: `list`. List of independent variables to use in the model. These must be either part of `pipeline_features` or `comparison_features`.
* `chosen_model` -- Type: `object`. The model to train with in the `training` task and score with in the `matching` task. See the [models](models) section for more information on model specifications.
* `chosen_model` -- Type: `object`. The model to train with in the `training` task and score with in the `matching` task. See the [Models](models) section for more information on model specifications.
* `threshold` -- Type: `float`. The threshold for which to accept model probability values as true predictions. Can be used to specify a threshold to use for all models, or can be specified within each `chosen_model` and `model_parameters` specification.
* `decision` -- Type: `string`. Optional. Specifies which decision function to use to create the final prediction. The first option is `drop_duplicate_a`, which drops any links for which a record in the `a` data set has a predicted match more than one time. The second option is `drop_duplicate_with_threshold_ratio` which only takes links for which the `a` record has the highest probability out of any other potential links, and the second best link for the `a` record is less than the `threshold_ratio`.
* `threshold_ratio` -- Type: `float`. Optional. For use when `decision` is `drop_duplicate_with_threshold_ratio` . Specifies the smallest possible ratio to accept between a best and second best link for a given record. Can be used to specify a threshold ratio (beta threshold) to use for all models. Alternatively, unique threshold ratios can be specified in each individual `chosen_model` and `model_parameters` specification.
* `model_parameters` -- Type: `list`. Specifies models to test out in the `model_exploration` task. See the [models](models) section for more information on model specifications.
* `param_grid` -- Type: `boolean`. Optional. If you would like to evaluate multiple hyper-parameters for a single model type in your `model_parameters` specification, you can give hyper-parameter inputs as arrays of length >= 1 instead of integers to allow one model per row specification with multiple model eval outputs.
* `decision` -- Type: `string`. Optional. Specifies which decision function to use to create the final prediction. The first option is `drop_duplicate_a`, which drops any links for which a record in the `a` data set has a predicted match more than one time. The second option is `drop_duplicate_with_threshold_ratio` which only takes links for which the `a` record has the highest probability out of any other potential links, and the second best link for the `a` record is less than the `threshold_ratio`.
* `score_with_model` -- Type: `boolean`. If set to false, will skip the `apply_model` step of the matching task. Use this if you want to use the `run_all_steps` command and are just trying to generate potential links, such as for the creation of training data.
* `n_training_iterations` -- Type: `integer`. Optional; default value is 10. The number of training iterations to use during the `model_exploration` task.
* `scale_data` -- Type: `boolean`. Optional. Whether to scale the data as part of the machine learning pipeline.
* `use_training_data_features` -- Type: `boolean`. Optional. If the identifiers in the training data set are not present in your raw input data, you will need to set this to `true`, or training features will not be able to be generated, giving null column errors. For example, if the training data set you are using has individuals from 1900 and 1910, but you are about to train a model to score the 1930-1940 potential matches, you need this to be set to `true` or it will fail, since the individual IDs are not present in the 1930 and 1940 raw input data. If you were about to train a model to score the 1900-1910 potential matches with this same training set, it would be best to set this to `false`, so you can be sure the training features are created from scratch to match your exact current configuration settings, although if you know the features haven't changed, you could set it to `true` to save a small amount of processing time.
* `output_suspicious_TD` -- Type: `boolean`. Optional. Used in the `model_exploration` link task. Outputs tables of potential matches that the model repeatedly scores differently than the match value given by the training data. Helps to identify false positives/false negatives in the training data, as well as areas that need additional training feature coverage in the model, or need increased representation in the training data set.
* `split_by_id_a` -- Type: `boolean`. Optional. Used in the `model_exploration` link task. When set to true, ensures that all potential matches for a given individual with ID_a are grouped together in the same train-test-split group. For example, if individual histid_a "A304BT" has three potential matches in the training data, one each to histid_b "B200", "C201", and "D425", all of those potential matches would either end up in the "train" split or the "test" split when evaluating the model performance.
* `feature_importances` -- Type: `boolean`. Optional. Whether to record
feature importances or coefficients for the training features when training
the ML model. Set this to true to enable training step 3.
* `model_parameters` -- Type: `list`. Specifies models to test out in the `model_exploration` task. See the [Model Exploration](model_exploration) page for a detailed description of how this works.
* `model_parameter_search` -- Type: `object`. Specifies which strategy hlink should
use to generate test models for [Model Exploration](model_exploration).
* `n_training_iterations` -- Type: `integer`. Optional; default value is 10. The number of outer folds to use during the `model_exploration` task. See [here](model_exploration.html#the-details) for more details.


```
@@ -764,7 +762,7 @@
dataset = "/path/to/1900_1910_training_data_20191023.csv"
dependent_var = "match"
use_training_data_features = false
output_suspicious_TD = true
split_by_id_a = true

score_with_model = true
@@ -773,7 +770,7 @@ feature_importances = true
decision = "drop_duplicate_with_threshold_ratio"

n_training_iterations = 10
param_grid = false
model_parameter_search = {strategy = "explicit"}
model_parameters = [
{ type = "random_forest", maxDepth = 6, numTrees = 50 },
{ type = "probit", threshold = 0.5}
@@ -782,7 +779,7 @@ model_parameters = [
chosen_model = { type = "logistic_regression", threshold = 0.5, threshold_ratio = 1.0 }
```

## Household training and models
## Household Training and [Model Exploration](model_exploration)

* Header name: `hh_training`
* Description: Specifies the household training data set as well as a myriad of attributes related to training a model including the dependent var within that data set, the independent vars created from the `comparison_features` section, and the different models you want to use.
@@ -804,13 +801,12 @@ scale_data = false
dataset = "/path/to/hh_training_data_1900_1910.csv"
dependent_var = "match"
use_training_data_features = false
output_suspicious_TD = true
split_by_id_a = true
score_with_model = true
feature_importances = true
decision = "drop_duplicate_with_threshold_ratio"

param_grid = true
model_parameter_search = {strategy = "grid"}
n_training_iterations = 10
model_parameters = [
{ type = "logistic_regression", threshold = [0.5], threshold_ratio = [1.1]},
1 change: 1 addition & 0 deletions docs/_sources/index.rst.txt
@@ -30,4 +30,5 @@ Configuration API
Feature Selection <feature_selection_transforms.md>
Pipeline Features <pipeline_features.md>
substitutions
model_exploration
models
195 changes: 195 additions & 0 deletions docs/_sources/model_exploration.md.txt
@@ -0,0 +1,195 @@
# Model Exploration

## Overview

The model exploration task provides a way to try out different types of machine
learning models and sets of parameters to those models. It tests those models
on splits of the training data and outputs information on the performance of
the models. The purpose of model exploration is to help you choose a model that
performs well without having to test each model individually on the entire
input datasets. If you're interested in the exact workings of the model exploration
algorithm, see the [Details](#the-details) section below.

Because model exploration is closely related to training, it reads several of its
configuration attributes from the `training` section.

## Searching for Model Parameters

Part of the process of model exploration is searching for model parameters which
give good results on the training data. Hlink supports three strategies for model
parameter searches, controlled by the `training.model_parameter_search` table.

### Explicit Search (`strategy = "explicit"`)

An explicit model parameter search lists out all of the parameter combinations
to be tested. Each element of the `training.model_parameters` list becomes one
set of parameters to evaluate. This is the simplest search strategy and is hlink's
default behavior.

This example `training` section uses an explicit search over two sets of model parameters.
Model exploration will train two random forest models. The first will have a
`maxDepth` of 3 and `numTrees` of 50, and the second will have a `maxDepth` of 3
and `numTrees` of 20.

```toml
[training.model_parameter_search]
strategy = "explicit"

[[training.model_parameters]]
type = "random_forest"
maxDepth = 3
numTrees = 50

[[training.model_parameters]]
type = "random_forest"
maxDepth = 3
numTrees = 20
```

### Grid Search (`strategy = "grid"`)

A grid search takes multiple values for each model parameter and generates one
model for each possible combination of the given parameters. This is often much more
compact than writing out all of the possible combinations in an explicit search.

For example, this `training` section generates 30 combinations of model
parameters for testing. The first has a `maxDepth` of 1 and `numTrees` of 20,
the second has a `maxDepth` of 1 and `numTrees` of 30, and so on.

```toml
[training.model_parameter_search]
strategy = "grid"

[[training.model_parameters]]
type = "random_forest"
maxDepth = [1, 2, 3, 5, 10]
numTrees = [20, 30, 40, 50, 60, 70]
```
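A grid search is just the Cartesian product of the value lists. As a rough sketch (the variable names here are illustrative, not hlink's internals), the 5 × 6 lists in the example above expand to 30 combinations:

```python
from itertools import product

# Parameter lists, mirroring the TOML example above.
param_lists = {
    "maxDepth": [1, 2, 3, 5, 10],
    "numTrees": [20, 30, 40, 50, 60, 70],
}

# One dict per combination: the Cartesian product of all value lists.
combinations = [
    dict(zip(param_lists.keys(), values))
    for values in product(*param_lists.values())
]

print(len(combinations))  # 30
print(combinations[0])    # {'maxDepth': 1, 'numTrees': 20}
```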

Although grid search is more compact than explicitly listing out all of the model
parameters, it can be quite time-consuming to check every possible combination of
model parameters. Randomized search, described below, can be a more efficient way
to evaluate models with large numbers of parameters or large parameter ranges.


### Randomized Search (`strategy = "randomized"`)

*Added in version 4.0.0.*

A randomized parameter search generates model parameter settings by sampling each
parameter from a distribution or set. The number of samples is an additional parameter
to the strategy. This separates the size of the search space from the number of samples
taken, making a randomized search more flexible than a grid search. The downside is
that, unlike a grid search, a randomized search does not necessarily test every
value given for each parameter: the search is non-exhaustive by design.

In a randomized search, each model parameter may take one of 3 forms:

* A list, which is a set of values to sample from with replacement. Each value has an equal chance
of being chosen for each sample.

```toml
[[training.model_parameters]]
type = "random_forest"
numTrees = [20, 30, 40]
```

* A single value, which "pins" the model parameter to always be that value. This
is syntactic sugar for sampling from a list with one element.

```toml
[[training.model_parameters]]
type = "random_forest"
# numTrees will always be 30.
# This is equivalent to numTrees = [30].
numTrees = 30
```

* A table defining a distribution from which to sample the parameter. The available
distributions are `"randint"`, to choose a random integer from a range, `"uniform"`,
to choose a random floating-point number from a range, and `"normal"`, to choose
a floating-point number from a normal distribution with a given mean and standard
deviation.

For example, this `training` section generates 20 model parameter combinations
for testing, using a randomized search. Each of the three given model parameters
uses a different type of distribution.

```toml
[training.model_parameter_search]
strategy = "randomized"
num_samples = 20

[[training.model_parameters]]
type = "random_forest"
numTrees = {distribution = "randint", low = 20, high = 70}
minInfoGain = {distribution = "uniform", low = 0.0, high = 0.3}
subsamplingRate = {distribution = "normal", mean = 1.0, standard_deviation = 0.2}
```
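The three parameter forms above can be sketched with Python's standard `random` module. This is a hedged approximation of the sampling rules, not hlink's actual implementation; the helper name `sample_parameter` is an assumption.

```python
import random

def sample_parameter(spec, rng):
    """Sample one value for a parameter spec (assumed semantics)."""
    if isinstance(spec, list):
        # A list: sample uniformly, with replacement across draws.
        return rng.choice(spec)
    if isinstance(spec, dict):
        # A table: sample from the named distribution.
        dist = spec["distribution"]
        if dist == "randint":
            return rng.randint(spec["low"], spec["high"])
        if dist == "uniform":
            return rng.uniform(spec["low"], spec["high"])
        if dist == "normal":
            return rng.normalvariate(spec["mean"], spec["standard_deviation"])
        raise ValueError(f"unknown distribution: {dist}")
    # A single value is "pinned": always returned as-is.
    return spec

rng = random.Random(2024)
params = {
    "numTrees": {"distribution": "randint", "low": 20, "high": 70},
    "minInfoGain": {"distribution": "uniform", "low": 0.0, "high": 0.3},
    "subsamplingRate": 1.0,
}
sample = {name: sample_parameter(spec, rng) for name, spec in params.items()}
```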

### The `training.param_grid` Attribute

As of version 4.0.0, the `training.param_grid` attribute is deprecated. Please use
`training.model_parameter_search` instead, as it is more flexible and supports additional
parameter search strategies. Prior to version 4.0.0, you will need to use `training.param_grid`.

`param_grid` has a direct mapping to `model_parameter_search`.

```toml
[training]
param_grid = true
```

is equivalent to

```toml
[training]
model_parameter_search = {strategy = "grid"}
```

and

```toml
[training]
param_grid = false
```

is equivalent to

```toml
[training]
model_parameter_search = {strategy = "explicit"}
```

### Types and Thresholds


There are 3 attributes which are hlink-specific and are not passed through as model parameters.
* `type` is the name of the model type.
* `threshold` and `threshold_ratio` control how hlink classifies potential matches
based on the probabilistic output of the models. They may each be either a float
or a list of floats, and hlink will always use a grid strategy to generate the
set of test combinations for these parameters.

For more details, please see the [Models](models) page and the [Details](#the-details)
section below.
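Since lists of `threshold` and `threshold_ratio` always expand as a grid, a model given two values for each is tested at four threshold settings. A minimal sketch (the exact pairing logic is an assumption):

```python
from itertools import product

# Threshold lists as they might appear in a model_parameters entry.
thresholds = [0.5, 0.8]
threshold_ratios = [1.0, 1.2]

# hlink grids over threshold settings, so every pair is tested.
combos = list(product(thresholds, threshold_ratios))
# combos -> [(0.5, 1.0), (0.5, 1.2), (0.8, 1.0), (0.8, 1.2)]
```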

## The Details

The current model exploration implementation uses a technique called nested cross-validation to evaluate each model which the search strategy generates. The algorithm follows this basic outline.

Let `N` be the value of `training.n_training_iterations`.
Let `J` be 3. (Currently `J` is hard-coded).

1. Split the prepared training data into `N` **outer folds**. This forms a partition of the training data into `N` distinct pieces, each of roughly equal size.
2. Choose the first **outer fold**.
3. Combine the `N - 1` other **outer folds** into the set of outer training data.
4. Split the outer training data into `J` **inner folds**. This forms a partition of the outer training data into `J` distinct pieces, each of roughly equal size.
5. Choose the first **inner fold**.
6. Combine the `J - 1` other **inner folds** into the set of inner training data.
7. Train, test, and score all of the models using the inner training data and the first **inner fold** as the test data.
8. Repeat steps 5 - 7 for each other **inner fold**.
9. After finishing all of the **inner folds**, choose the single model with the best aggregate score over those folds.
10. For each setting of `threshold` and `threshold_ratio`, train the best model on the outer training data and test it on the chosen **outer fold**. Collect metrics on the performance of the model based on its confusion matrix.
11. Repeat steps 2-10 for each other **outer fold**.
12. Report on all of the metrics gathered for the best-scoring models.
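The outline above can be sketched with plain Python lists standing in for Spark DataFrames. The fold-assignment scheme and the `score` callback here are placeholders, not hlink's actual implementation:

```python
def nested_cv(records, n_outer, n_inner, models, score):
    """Nested cross-validation sketch: pick the best model per outer fold.

    score(model, train, test) returns a scalar; higher is better.
    """
    # Partition the records into n_outer roughly equal outer folds.
    outer = [records[i::n_outer] for i in range(n_outer)]
    results = []
    for k, outer_test in enumerate(outer):
        outer_train = [r for j, fold in enumerate(outer) if j != k for r in fold]
        # Partition the outer training data into n_inner inner folds.
        inner = [outer_train[i::n_inner] for i in range(n_inner)]
        totals = {m: 0.0 for m in models}
        for i, inner_test in enumerate(inner):
            inner_train = [r for j, fold in enumerate(inner) if j != i for r in fold]
            for m in models:
                totals[m] += score(m, inner_train, inner_test)
        # Best aggregate score across the inner folds wins.
        best = max(totals, key=totals.get)
        # Evaluate the winner on the held-out outer fold.
        results.append((best, score(best, outer_train, outer_test)))
    return results
```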