# model.R (forked from langcog/standard-model)
library(tidyr) # gather(), used in make_long_df()
library(dplyr) # %>%, group_by(), summarise(), used in the commented example at the bottom

# Logistic helper (not currently used below).
sigmoid <- function(x) {
  return(1 / (1 + exp(-x)))
}
#library(profvis)
#profvis({
# Reshape a learners x months matrix into a long data frame with columns id, month, words.
make_long_df <- function(df, n_learners, max_age) {
  #df = cbind(rep(zeroval, n_learners), df) # value at 0 mos
  df = data.frame(df)
  names(df) = 1:max_age
  df$id = 1:nrow(df)
  df_l = gather(df, "month", "words", 1:max_age)
  df_l$month = as.numeric(as.character(df_l$month))
  return(df_l)
}
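# A minimal usage sketch (commented out; the matrix values are purely illustrative):
# two learners tracked over three months become a 6-row long data frame.
# m = matrix(c(10, 12, 20, 25, 35, 40), nrow=2) # rows = learners, cols = months
# make_long_df(m, n_learners=2, max_age=3)      # columns: id, month, words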
# eventually: what happens when you have a larger vocab (60k) but only test 500 words (CDI simulation)?
# test for acceleration across all param values
# consolidate learning rate and threshold?
# variability in the number of effective instances needed to learn a word
# load generated tokens
# need to add proc_speed term and proc_facilitates interaction term
# Simulate vocabulary growth for n_learners learners from month 1 to max_age.
# Returns long data frames of known words and mean processing speed per learner per month.
simulate <- function(vocab_size=10000, distro, input_rate, n_learners=100, threshold,
                     max_age=48, mean_learning_rate, threshold_sd, proc_facilitates,
                     proc_speed_dev) {
  #probs = wf_distros[[distro]] # "zipf" or "uniform"
  if(distro=="uniform") {
    probs = rep(1/vocab_size, vocab_size)
  } else {
    probs = 1 / (1:vocab_size + 2.7) # f(r) = 1 / (r+beta)^alpha, alpha=1, beta=2.7 (Mandelbrot, 1953, 1962)
    probs = probs / sum(probs)
    probs = sample(probs, length(probs)) # shuffle so word index is not ordered by frequency
  }
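  # Illustrative figure, not computed in the code: with vocab_size = 10000 the rank-1 word
  # carries roughly 3% of the token mass, so the Zipfian input is heavily skewed; the shuffle
  # above only reassigns which word index gets which probability.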
  # maybe learning_rate should be beta-distributed?
  learning_rate = rnorm(n_learners, mean=mean_learning_rate, sd=1) # individual learning rates
  learning_rate[which(learning_rate<0)] = 0.1 # clamp negative draws to a small positive rate
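  # One possible beta-distributed alternative (sketch, not used; the shape parameters are
  # illustrative assumptions chosen to keep the same mean with a strictly positive range):
  # learning_rate = 2 * mean_learning_rate * rbeta(n_learners, shape1=2, shape2=2)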
  # fixed threshold for all words, or normal distribution over word difficulty (like McMurray 2007)
  #if(threshold_varies) threshold = rnorm(vocab_size, mean=threshold, sd=20)
  threshold = rnorm(vocab_size, mean=threshold, sd=threshold_sd) # per-word thresholds
  cumulative_word_occs = matrix(0, nrow=n_learners, ncol=vocab_size) # number of times each word has appeared per learner
  known_words = matrix(0, nrow=n_learners, ncol=max_age) # known words per individual (row) per month (col)
  proc_speed = matrix(0, nrow=n_learners, ncol=max_age) # do we want per word instead of per learner?
  # processing speed
  # Kail (1991): RT slopes follow an exponential, Y(i) = a + b*exp(-c*i), where Y = predicted var, i = age
  a = 0.56 # eventual (adult) asymptote
  b = 1.04 # multiplier for the (infant) intercept
  c = proc_speed_dev # rate of development (0.72 in Frank, Lewis, & MacDonald, 2016)
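  # Illustrative values under the assumption c = 0.72: before any input, each word contributes
  # b*exp(-c*log(0+1)) = b = 1.04 to the per-learner average computed below, so proc_speed starts
  # near a + b = 1.60 and falls toward the adult asymptote a = 0.56 as occurrences accumulate
  # (a word heard 100 times contributes only about 0.04).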
  tokens_per_mo = round(input_rate*30.42)*n_learners # tokens/day * days/month, pooled across learners
  # (should be: tokens/hour * waking hours/day * days/month)
  tokens = sample(1:vocab_size, tokens_per_mo*max_age, prob=probs, replace=T) # was hard-coded to 48 months
  for(t in 1:max_age) {
    # take 1 month of tokens for all learners
    #tokens_mo = matrix(sample(1:vocab_size, tokens_per_mo, prob=probs, replace=T), nrow=n_learners) # 3340ms
    tok_i = (t-1)*tokens_per_mo + 1 # first token index for month t (1:tokens_per_mo for t=1, etc.)
    #tokens_mo = matrix(tokens[[distro]][tok_i:(t*tokens_per_mo)], nrow=n_learners) # if using generate_tokens.R
    tokens_mo = matrix(tokens[tok_i:(t*tokens_per_mo)], nrow=n_learners)
    mo_word_occs = apply(tokens_mo, 1, tabulate, nbins=vocab_size) # vocab_size x n_learners counts (1020ms)
    proc_speed[,t] = a + rowSums(b*exp(-c*log(cumulative_word_occs+1))) / vocab_size # per-learner mean RT term (930ms)
    # learning rate scales the value of occurrences
    if(proc_facilitates) { # further scale value of word occurrences by processing speed
      # faster (lower-RT) learners get more value per occurrence:
      # (3 - proc_speed) is ~1.4 at the infant intercept and ~2.4 at the adult asymptote
      cumulative_word_occs = cumulative_word_occs + (3-proc_speed[,t])*learning_rate*t(mo_word_occs) # 380ms
    } else {
      cumulative_word_occs = cumulative_word_occs + learning_rate*t(mo_word_occs) # 360ms; accumulate occurrences this month
    }
    # compare each learner's accumulated occurrences against the per-word thresholds;
    # sweep() matches thresholds by column (a plain matrix > vector comparison would recycle incorrectly)
    known_words[,t] = rowSums(sweep(cumulative_word_occs, 2, threshold, ">"))
  }
  # return known words and mean RT per individual per month, reshaped to long
  known_words_l = make_long_df(known_words, n_learners, max_age)
  proc_speed_l = make_long_df(proc_speed, n_learners, max_age)
  return(list(known_words=known_words_l, proc_speed=proc_speed_l))
}
# Example call (arguments in order: vocab_size, distro, input_rate, n_learners, threshold,
# max_age, mean_learning_rate, threshold_sd, proc_facilitates, proc_speed_dev):
#sim = simulate(10000, "uniform", 1000, 100, 100, 48, 2, 10, F, .72)
#sim$known_words %>% group_by(month) %>% summarise(mean=mean(words), sd=sd(words))
#}) # profiler
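
# A minimal end-to-end sketch (commented out; assumes ggplot2 is installed, and the argument
# values below are illustrative rather than taken from any paper):
# sim = simulate(vocab_size=10000, distro="zipf", input_rate=1000, n_learners=100,
#                threshold=100, max_age=48, mean_learning_rate=2, threshold_sd=10,
#                proc_facilitates=TRUE, proc_speed_dev=0.72)
# library(ggplot2)
# ggplot(sim$known_words, aes(month, words, group=id)) +
#   geom_line(alpha=.2) +                                          # individual growth curves
#   stat_summary(aes(group=1), fun=mean, geom="line", color="blue") + # mean trajectory
#   labs(x="Age (months)", y="Known words")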