Score auto correlation

Calculate an autocorrelation score for a factor: how well its level on a trial can be predicted from the levels in the n preceding trials
sweetpea-org · May 3, 2023 · 2d2bb85 · 2d2bb85
1 parent 17cc6ae
commit 2d2bb85
Show file tree

Hide file tree

Showing 3 changed files with 257 additions and 0 deletions.
diff --git a/acceptance/test_auto_correlation_score.py b/acceptance/test_auto_correlation_score.py
@@ -0,0 +1,42 @@
+from sweetpea import *
+
+# Three hand-written trial sequences used as fixtures by the tests below.
+# Each sample maps a factor name ('color', 'word', 'congruency') to a list
+# of six levels, one entry per trial.
+samples = [
+    {
+        'color': ['red', 'green', 'red', 'green', 'red', 'green'],
+        'word': ['red', 'green', 'red', 'green', 'red', 'red'],
+        'congruency': ['con', 'con', 'inc', 'con', 'inc', 'con']
+    },
+    {
+        'color': ['red', 'green', 'red', 'green', 'red', 'green'],
+        'word': ['red', 'red', 'green', 'red', 'red', 'green'],
+        'congruency': ['con', 'con', 'con', 'inc', 'con', 'con']
+    },
+    {
+        'color': ['green', 'red', 'green', 'red', 'green', 'red'],
+        'word': ['green', 'red', 'red', 'red', 'green', 'red'],
+        'congruency': ['con', 'inc', 'con', 'con', 'con', 'inc']
+    }]
+
+
+def test_score_auto_correlation_all():
+    """Scoring without explicit factor names reports every factor of the samples."""
+    # scikit-learn is an optional dependency; without it there is nothing to check.
+    try:
+        from sklearn.neural_network import MLPClassifier
+    except ImportError:
+        return
+    scores = auto_correlation_scores_samples_between(samples)
+    for factor in ('color', 'word', 'congruency'):
+        assert factor in scores
+
+
+def test_score_auto_correlation():
+    """Scoring with an explicit factor list reports only the requested factor."""
+    # scikit-learn is an optional dependency; without it there is nothing to check.
+    try:
+        from sklearn.neural_network import MLPClassifier
+    except ImportError:
+        return
+    scores = auto_correlation_scores_samples_between(samples, ['color'])
+    assert 'color' in scores
+    assert 'word' not in scores
+    assert 'congruency' not in scores
+
+
diff --git a/sweetpea/_internal/auto_correlation_score.py b/sweetpea/_internal/auto_correlation_score.py
@@ -0,0 +1,143 @@
+import random
+
+
+def _get_level_to_float_mapping(factor, sample) -> dict:
+    """Map each distinct level of ``factor`` in ``sample`` to a float code.
+
+    The distinct levels are sorted and numbered 0.0, 1.0, 2.0, ... in that order.
+    """
+    distinct_levels = sorted(set(sample[factor]))
+    return {level: float(index) for index, level in enumerate(distinct_levels)}
+
+
+def _convert_sample_levels_to_floats(factor_dict, sample) -> dict:
+    """Return ``sample`` with every level replaced by its code from ``factor_dict``.
+
+    ``factor_dict`` maps a factor name to a level -> code dictionary; a level or
+    factor missing from it raises ``KeyError``.
+    """
+    return {
+        name: [factor_dict[name][level] for level in levels]
+        for name, levels in sample.items()
+    }
+
+
+def convert_samples(samples: list) -> list:
+    """Convert the levels in a list of samples to numbers.
+
+    One level -> float mapping is built per factor from the union of the levels
+    seen in *all* samples, so a given level gets the same code in every sample
+    and a level that occurs in only some samples can still be converted.
+
+    :param samples:
+        A list of dicts, each mapping a factor name to a list of levels.
+    :returns:
+        A new list of dicts with the same keys, where each level is replaced by
+        the float index of that level in the sorted set of the factor's levels.
+    """
+    # Collect every level of every factor across all samples first; building
+    # the mapping from a single sample would miss levels that sample lacks.
+    levels_by_factor = {}
+    for sample in samples:
+        for name, levels in sample.items():
+            levels_by_factor.setdefault(name, set()).update(levels)
+    factor_dict = {
+        name: {level: float(index) for index, level in enumerate(sorted(levels))}
+        for name, levels in levels_by_factor.items()
+    }
+    return [
+        {name: [factor_dict[name][level] for level in levels] for name, levels in sample.items()}
+        for sample in samples
+    ]
+
+
+def convert_sample(sample: dict) -> dict:
+    """Convert the levels of a single sample to float codes.
+
+    A level -> float mapping is built per factor from this sample alone and
+    then applied to every factor's level list.
+    """
+    level_codes = {name: _get_level_to_float_mapping(name, sample) for name in sample.keys()}
+    return _convert_sample_levels_to_floats(level_codes, sample)
+
+
+def train_test_split_samples(samples: list, percentage: float = .8) -> tuple:
+    """Shuffle a copy of ``samples`` and cut it into a train and a test part.
+
+    :param samples: the list of samples; it is copied, not modified.
+    :param percentage: fraction of the samples that goes into the train part.
+    :returns: a ``(train, test)`` tuple of lists.
+    :raises Exception: if either part would be empty.
+    """
+    shuffled = samples.copy()
+    random.shuffle(shuffled)
+    cut = int(percentage * len(shuffled))
+    if cut in (0, len(shuffled)):
+        raise Exception('Train or test set empty')
+    train_part, test_part = shuffled[:cut], shuffled[cut:]
+    return train_part, test_part
+
+
+def create_x_y_sample(sample: dict, y_factor: str, k: int = 10) -> tuple:
+    """Build sliding-window predictors and targets from one sample.
+
+    The window length is ``min(len(sample[y_factor]) // 2, k)``. For every
+    window position the predictor row is the concatenation, over all factors of
+    the sample, of that factor's values inside the window; the target is the
+    ``y_factor`` value directly after the window.
+
+    :returns: a ``(x_rows, y_values)`` tuple of equally long lists.
+    :raises Exception: if the target list is not longer than the window.
+    """
+    targets = sample[y_factor]
+    columns = [sample[name] for name in sample.keys()]
+    n_trials = len(targets)
+    window = min(n_trials // 2, k)
+    if n_trials <= window:
+        raise Exception('predict distance to high in auto correlation test')
+    positions = range(n_trials - window)
+    x_res = [
+        [value for column in columns for value in column[first: first + window]]
+        for first in positions
+    ]
+    y_res = [targets[first + window] for first in positions]
+    return x_res, y_res
+
+
+def create_x_y_train_test_samples(samples: list, factor: str, percentage: float = .8, k: int = 10) -> tuple:
+    """Split the samples into train and test sets and build x/y data for each.
+
+    :returns: ``(x_train, y_train, x_test, y_test)``, where each list is the
+        concatenation of the per-sample results of ``create_x_y_sample``.
+    """
+    train_set, test_set = train_test_split_samples(samples, percentage)
+
+    def _collect(sample_set):
+        # Concatenate the windowed predictors/targets of every sample in the set.
+        xs, ys = [], []
+        for sample in sample_set:
+            x_part, y_part = create_x_y_sample(sample, factor, k)
+            xs += x_part
+            ys += y_part
+        return xs, ys
+
+    x_train, y_train = _collect(train_set)
+    x_test, y_test = _collect(test_set)
+    return x_train, y_train, x_test, y_test
+
+
+def create_x_y_train_test_sample(sample: dict, factor: str, train_test_split: float = .8,
+                                 k: int = 10) -> tuple:
+    """Build x/y data from one sample and split it randomly into train and test.
+
+    :param sample: dict mapping each factor name to its list of (numeric) levels.
+    :param factor: the factor whose level is the prediction target.
+    :param train_test_split: fraction of the x/y pairs used for training.
+    :param k: maximum number of preceding trials used as predictors
+        (forwarded to ``create_x_y_sample``).
+    :returns: ``(x_train, y_train, x_test, y_test)``.
+    :raises Exception: if the train or the test part would be empty.
+    """
+    x_set, y_set = create_x_y_sample(sample, factor, k)
+    zipped = list(zip(x_set, y_set))
+    # Shuffle predictor/target pairs together so they stay aligned.
+    random.shuffle(zipped)
+    split = int(train_test_split * len(zipped))
+    if split == 0 or split == len(zipped):
+        raise Exception('Train or test set empty')
+    train_pairs, test_pairs = zipped[:split], zipped[split:]
+    return [x for x, _ in train_pairs], \
+        [y for _, y in train_pairs], \
+        [x for x, _ in test_pairs], \
+        [y for _, y in test_pairs]
+
+
+def auto_correlation_score_factor_within(sample: dict, factor: str, train_test_split: float = .8,
+                                         k: int = 10, starts: int = 10) -> float:
+    """Get the auto correlation score for a single factor within one sample.
+
+    The sample is converted to numeric levels; then, ``starts`` times, the
+    windowed x/y data is randomly split, an ``MLPClassifier`` is fitted on the
+    train part and scored on the test part. The best score is returned.
+
+    :param k: maximum number of preceding trials used as predictors.
+    :returns: the highest test score over all starts (0 if ``starts`` is 0).
+    :raises Exception: if scikit-learn is not installed.
+    """
+    try:
+        from sklearn.neural_network import MLPClassifier
+    except ImportError as e:
+        raise Exception(
+            'To use a auto correlation test, please install the scikit-learn package: pip install scikit-learn\n')
+    sample_converted = convert_sample(sample)
+    score_max = 0
+    for i in range(starts):
+        # Forward k so the requested window length is actually honoured.
+        x_train, y_train, x_test, y_test = create_x_y_train_test_sample(sample_converted, factor,
+                                                                        train_test_split, k)
+        clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(15,), random_state=1)
+        clf.fit(x_train, y_train)
+        score_max = max(score_max, clf.score(x_test, y_test))
+    return score_max
+
+
+def auto_correlation_score_factor_between(samples: list, factor: str, train_test_split: float = .8,
+                                          k: int = 10, starts: int = 10) -> float:
+    """Get the auto correlation score for a single factor across a list of samples.
+
+    The samples are converted to numeric levels; then, ``starts`` times, they
+    are randomly split into train and test samples, an ``MLPClassifier`` is
+    fitted on the train data and scored on the test data. The best score wins.
+
+    :returns: the highest test score over all starts (0 if ``starts`` is 0).
+    :raises Exception: if scikit-learn is not installed.
+    """
+    try:
+        from sklearn.neural_network import MLPClassifier
+    except ImportError:
+        raise Exception(
+            'To use a auto correlation test, please install the scikit-learn package: pip install scikit-learn\n')
+    numeric_samples = convert_samples(samples)
+    best_score = 0
+    for _ in range(starts):
+        split_data = create_x_y_train_test_samples(numeric_samples, factor, train_test_split, k)
+        x_train, y_train, x_test, y_test = split_data
+        model = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(15,), random_state=1)
+        model.fit(x_train, y_train)
+        accuracy = model.score(x_test, y_test)
+        if accuracy > best_score:
+            best_score = accuracy
+    return best_score
diff --git a/sweetpea/_internal/main.py b/sweetpea/_internal/main.py
@@ -3,6 +3,8 @@
 __all__ = [
     'synthesize_trials', 'sample_mismatch_experiment',
 
+    'auto_correlation_scores_sample_within', 'auto_correlation_scores_samples_between',
+
     'print_experiments', 'tabulate_experiments',
     'save_experiments_csv', 'experiments_to_tuples',
 
@@ -51,6 +53,9 @@
 from sweetpea._internal.core.cnf import Var
 from sweetpea._internal.argcheck import argcheck, make_islistof
 
+from sweetpea._internal.auto_correlation_score import (auto_correlation_score_factor_within,
+                                                       auto_correlation_score_factor_between)
+
 
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # ~~~~~~~~~~~~~ Top-Level functions ~~~~~~~~~~~~~~~~~~~~~
@@ -362,6 +367,73 @@ def sample_mismatch_experiment(block: Block, sample: dict) -> dict:
     return res
 
 
+def auto_correlation_scores_samples_between(samples: list, factor_names: List[str] = [],
+                                            number_trials: int = 10, starts: int = 10) -> dict:
+    """Given a number of samples as a :class:`list` of trial sets, calculates
+    an auto correlation score for each factor, representing how well a level can
+    be predicted from the preceding trials. This is done by training a neural
+    network to predict a factor from the levels of all factors in the preceding
+    trials. The number of preceding trials taken into account is the minimum of
+    number_trials and half the sequence length.
+
+    :param samples:
+        A :class:`list` of trial sets. Each set is represented as a :class:`dictionary <dict>`
+        mapping each factor name to a list of levels, where each such list contains
+        one level per trial.
+    :param factor_names:
+        A :class:`list` of strings naming the factors to be tested. If empty,
+        all factors of the first sample are tested.
+    :param number_trials:
+        An :class:`int` that indicates how many trials before the predicted trial
+        to use for the prediction.
+    :param starts:
+        An :class:`int` that indicates how many times a new neural network is created.
+        The final score is the maximum prediction score of these networks.
+    :returns:
+        A :class:`dict` mapping each tested factor name to its auto correlation score.
+    """
+    factors = factor_names if factor_names else samples[0].keys()
+    return {
+        f: auto_correlation_score_factor_between(samples, f, k=number_trials, starts=starts)
+        for f in factors
+    }
+
+
+def auto_correlation_scores_sample_within(sample: dict, factor_names: List[str] = [],
+                                          number_trials: int = 10, starts: int = 10) -> dict:
+    """Given a single sample as a :class:`dict` of a trial set, calculates
+    an auto correlation score for each factor, representing how well a level can
+    be predicted from the preceding trials. This is done by training a neural
+    network to predict a factor from the levels of all factors in the preceding
+    trials.
+
+    :param sample:
+        A :class:`dict` mapping each factor name to a list of levels, where each
+        such list contains one level per trial.
+    :param factor_names:
+        A :class:`list` of strings naming the factors to be tested. If empty,
+        all factors of the sample are tested.
+    :param number_trials:
+        An :class:`int` forwarded as ``k`` to the per-factor scorer; intended as the
+        number of trials before the predicted trial to use for the prediction.
+    :param starts:
+        An :class:`int` that indicates how many times a new neural network is created.
+        The final score is the maximum prediction score of these networks.
+    :returns:
+        A :class:`dict` mapping each tested factor name to its auto correlation score.
+    """
+    factors = factor_names if factor_names else sample.keys()
+    return {
+        f: auto_correlation_score_factor_within(sample, f, k=number_trials, starts=starts)
+        for f in factors
+    }
+
+
 # TODO: This function isn't called anywhere, so it should be removed.
 def save_cnf(block: Block, filename: str):
     """Generates a CNF formula from a :class:`.Block` and then writes that CNF