Skip to content

Commit

Permalink
Score auto correlation
Browse files Browse the repository at this point in the history
Calculate an autocorrelation score from how well a factor's level can be predicted from the levels in the n preceding trials
  • Loading branch information
younesStrittmatter authored May 3, 2023
1 parent 17cc6ae commit 2d2bb85
Show file tree
Hide file tree
Showing 3 changed files with 257 additions and 0 deletions.
42 changes: 42 additions & 0 deletions acceptance/test_auto_correlation_score.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from sweetpea import *

# Three hand-written trial sequences of six trials each, used as fixtures by
# the tests below. Every sample maps a factor name to its per-trial levels.
samples = [
    {
        'color': ['red', 'green', 'red', 'green', 'red', 'green'],
        'word': ['red', 'green', 'red', 'green', 'red', 'red'],
        'congruency': ['con', 'con', 'inc', 'con', 'inc', 'con'],
    },
    {
        'color': ['red', 'green', 'red', 'green', 'red', 'green'],
        'word': ['red', 'red', 'green', 'red', 'red', 'green'],
        'congruency': ['con', 'con', 'con', 'inc', 'con', 'con'],
    },
    {
        'color': ['green', 'red', 'green', 'red', 'green', 'red'],
        'word': ['green', 'red', 'red', 'red', 'green', 'red'],
        'congruency': ['con', 'inc', 'con', 'con', 'con', 'inc'],
    },
]


def test_score_auto_correlation_all():
    """Scoring without a factor selection yields an entry for every factor."""
    # The score needs scikit-learn; without it the test passes trivially.
    try:
        from sklearn.neural_network import MLPClassifier
    except ImportError:
        return
    scores = auto_correlation_scores_samples_between(samples)
    for factor in ('color', 'word', 'congruency'):
        assert factor in scores


def test_score_auto_correlation():
    """Scoring with an explicit factor selection only reports those factors."""
    # The score needs scikit-learn; without it the test passes trivially.
    try:
        from sklearn.neural_network import MLPClassifier
    except ImportError:
        return
    scores = auto_correlation_scores_samples_between(samples, ['color'])
    # Of the three fixture factors, only the requested one may appear.
    assert set(scores) & {'color', 'word', 'congruency'} == {'color'}


143 changes: 143 additions & 0 deletions sweetpea/_internal/auto_correlation_score.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
import random


def _get_level_to_float_mapping(factor, sample) -> dict:
    """Map each distinct level of ``factor`` in ``sample`` to a float code.

    The distinct levels are sorted and numbered 0.0, 1.0, ... in that order.
    """
    distinct_levels = sorted(set(sample[factor]))
    return {level: float(code) for code, level in enumerate(distinct_levels)}


def _convert_sample_levels_to_floats(factor_dict, sample) -> dict:
    """Return a copy of ``sample`` with every level replaced by its float code.

    ``factor_dict`` maps a factor name to that factor's level-to-float mapping.
    A factor or level missing from ``factor_dict`` raises ``KeyError``.
    """
    return {
        name: [factor_dict[name][level] for level in sample[name]]
        for name in sample.keys()
    }


def convert_samples(samples: list) -> list:
    """Convert the levels in a list of samples to numbers.

    One level-to-float mapping is built per factor from the levels seen in
    ALL samples, so a given level gets the same code in every sample. The
    distinct levels of a factor are sorted and numbered 0.0, 1.0, ...

    :param samples:
        A list of samples, each a dict mapping a factor name to its list of
        levels (one per trial). A factor may be absent from some samples.
    :returns:
        A new list of dicts with the same keys, where each level has been
        replaced by its float code. The input is not modified.
    """
    # Gather the union of levels per factor across every sample. Building the
    # mapping from a single sample would miss levels that only occur elsewhere
    # and make the conversion below fail with a KeyError.
    levels_by_factor = {}
    for sample in samples:
        for factor, levels in sample.items():
            levels_by_factor.setdefault(factor, set()).update(levels)

    factor_dict = {
        factor: {level: float(code) for code, level in enumerate(sorted(levels))}
        for factor, levels in levels_by_factor.items()
    }

    return [
        {factor: [factor_dict[factor][level] for level in levels]
         for factor, levels in sample.items()}
        for sample in samples
    ]


def convert_sample(sample: dict) -> dict:
    """Convert the levels of a single sample to float codes.

    A level-to-float mapping is derived for each factor from this sample
    alone and then applied to the sample.
    """
    level_maps = {name: _get_level_to_float_mapping(name, sample) for name in sample.keys()}
    return _convert_sample_levels_to_floats(level_maps, sample)


def train_test_split_samples(samples: list, percentage: float = .8) -> tuple:
    """Randomly split a list of samples into a train part and a test part.

    A shuffled copy is cut at ``int(percentage * len(samples))``; the input
    list itself is left untouched.

    :returns: A ``(train, test)`` tuple of lists.
    :raises Exception: If either part would be empty.
    """
    shuffled = samples.copy()
    random.shuffle(shuffled)
    total = len(shuffled)
    cut = int(percentage * total)
    if cut in (0, total):
        raise Exception('Train or test set empty')
    train_part, test_part = shuffled[:cut], shuffled[cut:]
    return train_part, test_part


def create_x_y_sample(sample: dict, y_factor: str, k: int = 10) -> tuple:
    """Create the independent and dependent values of a sample.

    A window of ``k_`` consecutive trials slides over the sample, where
    ``k_`` is the smaller of ``k`` and half the number of trials. For each
    window position, the predictor row is the concatenation of the window's
    levels of every factor (in the sample's key order, including
    ``y_factor`` itself), and the target is the level of ``y_factor`` in the
    trial right after the window.

    :param sample:
        A dict mapping each factor name to its list of levels (one per trial).
    :param y_factor:
        Name of the factor to predict.
    :param k:
        Maximum number of preceding trials used for one prediction.
    :returns:
        A tuple ``(x_res, y_res)`` of equally long lists: predictor rows and
        their targets.
    :raises ValueError:
        If the effective window is empty, i.e. the sample has fewer than two
        trials or ``k`` is smaller than 1.
    """
    y_list = sample[y_factor]
    # Cap the window at half the sequence length.
    k_ = min(len(y_list) // 2, k)
    # An empty or negative window would yield feature-less rows or slices that
    # wrap around the end of the sequence, so reject it explicitly.
    if k_ < 1:
        raise ValueError(
            'auto correlation test needs at least two trials and a predict distance of at least 1')
    x_lists = list(sample.values())
    x_res = []
    y_res = []
    for end in range(k_, len(y_list)):
        start = end - k_
        x_temp = []
        for x in x_lists:
            x_temp += x[start:end]
        x_res.append(x_temp)
        y_res.append(y_list[end])
    return x_res, y_res


def create_x_y_train_test_samples(samples: list, factor: str, percentage: float = .8, k: int = 10) -> tuple:
    """Create train/test predictors and targets from a list of samples.

    The samples are first split into a train and a test group; the windowed
    predictor rows and targets of all samples in a group are then concatenated.

    :returns: A tuple ``(x_train, y_train, x_test, y_test)`` of lists.
    """
    train_set, test_set = train_test_split_samples(samples, percentage)

    def _collect(group):
        # Concatenate the rows and targets of every sample in the group.
        xs, ys = [], []
        for member in group:
            x_part, y_part = create_x_y_sample(member, factor, k)
            xs.extend(x_part)
            ys.extend(y_part)
        return xs, ys

    x_train, y_train = _collect(train_set)
    x_test, y_test = _collect(test_set)
    return x_train, y_train, x_test, y_test


def create_x_y_train_test_sample(sample: dict, factor: str, train_test_split: float = .8,
                                 k: int = 10) -> tuple:
    """Create train/test predictors and targets from a single sample.

    The windowed (predictor row, target) pairs of the sample are shuffled and
    cut at ``int(train_test_split * number_of_pairs)``.

    :param sample:
        A dict mapping each factor name to its list of levels.
    :param factor:
        Name of the factor to predict.
    :param train_test_split:
        Fraction of the pairs that goes into the train part.
    :param k:
        Maximum number of preceding trials used for one prediction.
    :returns: A tuple ``(x_train, y_train, x_test, y_test)`` of lists.
    :raises Exception: If the train or the test part would be empty.
    """
    x_set, y_set = create_x_y_sample(sample, factor, k)
    pairs = list(zip(x_set, y_set))
    random.shuffle(pairs)
    split = int(train_test_split * len(pairs))
    if split == 0 or split == len(pairs):
        raise Exception('Train or test set empty')
    train_pairs, test_pairs = pairs[:split], pairs[split:]
    return [x for x, _ in train_pairs], \
        [y for _, y in train_pairs], \
        [x for x, _ in test_pairs], \
        [y for _, y in test_pairs]


def auto_correlation_score_factor_within(sample: dict, factor: str, train_test_split: float = .8,
                                         k: int = 10, starts: int = 10) -> float:
    """Get the auto correlation score for a single factor of one sample (within).

    The sample is converted to numbers, and ``starts`` times a fresh random
    train/test split is drawn and a new classifier is fitted on it. The
    result is the highest test score reached.

    :param sample:
        A dict mapping each factor name to its list of levels.
    :param factor:
        Name of the factor to predict.
    :param train_test_split:
        Fraction of the windowed pairs used for training.
    :param k:
        Maximum number of preceding trials used for one prediction.
    :param starts:
        Number of split-and-fit rounds.
    :returns: The maximum score over all rounds (0 if ``starts`` is 0).
    :raises Exception: If scikit-learn is not installed.
    """
    try:
        from sklearn.neural_network import MLPClassifier
    except ImportError as e:
        raise Exception(
            'To use a auto correlation test, please install the scikit-learn package: pip install scikit-learn\n'
        ) from e
    sample_converted = convert_sample(sample)
    score_max = 0
    for _ in range(starts):
        # Forward k so the requested window size is actually honoured.
        x_train, y_train, x_test, y_test = create_x_y_train_test_sample(
            sample_converted, factor, train_test_split, k)
        clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(15,), random_state=1)
        clf.fit(x_train, y_train)
        score_max = max(score_max, clf.score(x_test, y_test))
    return score_max


def auto_correlation_score_factor_between(samples: list, factor: str, train_test_split: float = .8,
                                          k: int = 10, starts: int = 10) -> float:
    """Get the auto correlation score for a single factor over a list of samples (between).

    The samples are converted to numbers, and ``starts`` times a fresh random
    train/test split of the samples is drawn and a new classifier is fitted
    on it. The result is the highest test score reached.

    :returns: The maximum score over all rounds (0 if ``starts`` is 0).
    :raises Exception: If scikit-learn is not installed.
    """
    try:
        from sklearn.neural_network import MLPClassifier
    except ImportError as e:
        raise Exception(
            'To use a auto correlation test, please install the scikit-learn package: pip install scikit-learn\n')
    numeric_samples = convert_samples(samples)
    best = 0
    for _ in range(starts):
        split_data = create_x_y_train_test_samples(numeric_samples, factor, train_test_split, k)
        x_train, y_train, x_test, y_test = split_data
        model = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(15,), random_state=1)
        model.fit(x_train, y_train)
        accuracy = model.score(x_test, y_test)
        # Keep the running best only when strictly improved.
        if accuracy > best:
            best = accuracy
    return best
72 changes: 72 additions & 0 deletions sweetpea/_internal/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
__all__ = [
'synthesize_trials', 'sample_mismatch_experiment',

'auto_correlation_scores_sample_within', 'auto_correlation_scores_samples_between',

'print_experiments', 'tabulate_experiments',
'save_experiments_csv', 'experiments_to_tuples',

Expand Down Expand Up @@ -51,6 +53,9 @@
from sweetpea._internal.core.cnf import Var
from sweetpea._internal.argcheck import argcheck, make_islistof

from sweetpea._internal.auto_correlation_score import (auto_correlation_score_factor_within,
auto_correlation_score_factor_between)


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~ Top-Level functions ~~~~~~~~~~~~~~~~~~~~~
Expand Down Expand Up @@ -362,6 +367,73 @@ def sample_mismatch_experiment(block: Block, sample: dict) -> dict:
return res


def auto_correlation_scores_samples_between(samples: list, factor_names: List[str] = [],
                                            number_trials: int = 10, starts: int = 10) -> dict:
    """Given a number of samples as a :class:`list` of trial sets, calculates
    an auto correlation score per factor that represents how well a level can
    be predicted from the levels of the preceding trials. This is done by
    training a neural network to predict a factor from the levels of all
    factors in the preceding trials. The number of preceding trials taken
    into account is the minimum of number_trials and half the sequence length.

    :param samples:
        A :class:`list` of trial sets. Each set is represented as a
        :class:`dictionary <dict>` mapping each factor name to a list of
        levels, where each such list contains one level per trial.
    :param factor_names:
        A :class:`list` of strings naming the factors to be tested. If it is
        empty or None, all factors of the first sample are tested.
    :param number_trials:
        An :class:`int` that indicates how many trials before the predicted
        trial are used for the prediction.
    :param starts:
        An :class:`int` that indicates how many times a new neural network is
        created. The final score is the maximum prediction score of these
        networks.
    :returns:
        A :class:`dict` mapping each tested factor name to its auto
        correlation score.
    """
    # The default list is only read, never mutated.
    selected = factor_names if factor_names else samples[0].keys()
    return {
        name: auto_correlation_score_factor_between(samples, name, k=number_trials, starts=starts)
        for name in selected
    }


def auto_correlation_scores_sample_within(sample: dict, factor_names: List[str] = [],
                                          number_trials: int = 10, starts: int = 10) -> dict:
    """Given a single sample as a :class:`dict` of a trial set, calculates
    an auto correlation score per factor that represents how well a level can
    be predicted from the levels of the preceding trials. This is done by
    training a neural network to predict a factor from the levels of all
    factors in the preceding trials.

    :param sample:
        A :class:`dict` mapping each factor name to a list of levels, where
        each such list contains one level per trial.
    :param factor_names:
        A :class:`list` of strings naming the factors to be tested. If it is
        empty or None, all factors of the sample are tested.
    :param number_trials:
        An :class:`int` passed on as ``k`` to the per-factor scoring function;
        it is meant to indicate how many trials before the predicted trial
        are used for the prediction.
    :param starts:
        An :class:`int` that indicates how many times a new neural network is
        created. The final score is the maximum prediction score of these
        networks.
    :returns:
        A :class:`dict` mapping each tested factor name to its auto
        correlation score.
    """
    # The default list is only read, never mutated.
    selected = factor_names if factor_names else sample.keys()
    return {
        name: auto_correlation_score_factor_within(sample, name, k=number_trials, starts=starts)
        for name in selected
    }


# TODO: This function isn't called anywhere, so it should be removed.
def save_cnf(block: Block, filename: str):
"""Generates a CNF formula from a :class:`.Block` and then writes that CNF
Expand Down

0 comments on commit 2d2bb85

Please sign in to comment.