Refactoring: specify data folder at run-time via CLI
woctezuma committed Nov 14, 2022
1 parent f21e2ca commit ae3419e
Showing 6 changed files with 137 additions and 71 deletions.
simulations.py (27 changes: 13 additions & 14 deletions)
@@ -6,18 +6,14 @@
 import numpy as np
 from tqdm import tqdm
 
+from src.file import get_simulation_results_folder
 from src.pattern import (
     get_pattern,
     get_possible_words,
     pattern_to_int_list,
     patterns_to_string,
 )
-from src.prior import (
-    DATA_DIR,
-    get_frequency_based_priors,
-    get_true_wordle_prior,
-    get_word_list,
-)
+from src.prior import get_frequency_based_priors, get_true_wordle_prior, get_word_list
 from src.solver import brute_force_optimal_guess, optimal_guess
 
 GAME_NAMES = ["wordle", "dungleon"]
@@ -43,21 +39,22 @@ def simulate_games(
     next_guess_map_file=None,
     quiet=False,
 ):
-    all_words = get_word_list(short=False)
-    short_word_list = get_word_list(short=True)
+    all_words = get_word_list(game_name, short=False)
+    short_word_list = get_word_list(game_name, short=True)
 
     if first_guess is None:
         first_guess = optimal_guess(
             all_words,
             all_words,
             priors,
+            game_name=game_name,
             look_two_ahead=look_two_ahead,
             purely_maximize_information=purely_maximize_information,
             optimize_for_uniform_distribution=optimize_for_uniform_distribution,
         )
 
     if priors is None:
-        priors = get_frequency_based_priors()
+        priors = get_frequency_based_priors(game_name)
 
     if test_set is None:
         test_set = short_word_list
@@ -82,19 +79,21 @@ def get_next_guess(guesses, patterns, possibilities):
         choices = all_words
         if hard_mode:
             for guess, pattern in zip(guesses, patterns):
-                choices = get_possible_words(guess, pattern, choices)
+                choices = get_possible_words(guess, pattern, choices, game_name)
         if brute_force_optimize:
             next_guess_map[phash] = brute_force_optimal_guess(
                 choices,
                 possibilities,
                 priors,
+                game_name=game_name,
                 n_top_picks=brute_force_depth,
             )
         else:
             next_guess_map[phash] = optimal_guess(
                 choices,
                 possibilities,
                 priors,
+                game_name,
                 look_two_ahead=look_two_ahead,
                 purely_maximize_information=purely_maximize_information,
                 optimize_for_uniform_distribution=optimize_for_uniform_distribution,
@@ -123,10 +122,10 @@ def get_next_guess(guesses, patterns, possibilities):
         score = 1
         guess = first_guess
         while guess != answer:
-            pattern = get_pattern(guess, answer)
+            pattern = get_pattern(guess, answer, game_name)
             guesses.append(guess)
             patterns.append(pattern)
-            possibilities = get_possible_words(guess, pattern, possibilities)
+            possibilities = get_possible_words(guess, pattern, possibilities, game_name)
             possibility_counts.append(len(possibilities))
             score += 1
             guess = get_next_guess(guesses, patterns, possibilities)
@@ -187,7 +186,7 @@ def get_next_guess(guesses, patterns, possibilities):
         (next_guess_map, next_guess_map_file),
     ):
         if file:
-            path = os.path.join(DATA_DIR, "simulation_results", file)
+            path = os.path.join(get_simulation_results_folder(game_name), file)
             with open(path, "w", encoding="utf8") as fp:
                 json.dump(obj, fp)
 
@@ -214,7 +213,7 @@ def get_next_guess(guesses, patterns, possibilities):
    results, decision_map = simulate_games(
        game_name=args.game_name,
        first_guess=args.first_guess,
-       priors=get_true_wordle_prior(),
+       priors=get_true_wordle_prior(args.game_name),
        optimize_for_uniform_distribution=True,
        # shuffle=True,
        # brute_force_optimize=True,
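The final hunk reads args.game_name and args.first_guess, but the argument parser itself falls outside this diff. For reference, a minimal sketch of the CLI plumbing it implies; the flag names and defaults here are assumptions, not part of the commit:

# Hypothetical sketch of the parser behind args.game_name / args.first_guess;
# the real parser is not shown in this diff.
import argparse

GAME_NAMES = ["wordle", "dungleon"]  # as defined at the top of simulations.py

parser = argparse.ArgumentParser()
parser.add_argument("--game_name", choices=GAME_NAMES, default="wordle")
parser.add_argument("--first_guess", default=None)
args = parser.parse_args()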
src/entropy.py (21 changes: 13 additions & 8 deletions)
@@ -6,7 +6,7 @@
 # Functions associated with entropy calculation
 
 
-def get_pattern_distributions(allowed_words, possible_words, weights):
+def get_pattern_distributions(allowed_words, possible_words, weights, game_name):
     """
     For each possible guess in allowed_words, this finds the probability
     distribution across all the 3^5 wordle patterns you could see, assuming
@@ -17,7 +17,7 @@ def get_pattern_distributions(allowed_words, possible_words, weights):
     that to bucket together words from possible_words which would produce
     the same pattern, adding together their corresponding probabilities.
     """
-    pattern_matrix = get_pattern_matrix(allowed_words, possible_words)
+    pattern_matrix = get_pattern_matrix(allowed_words, possible_words, game_name)
 
     n = len(allowed_words)
     distributions = np.zeros((n, 3**5))
@@ -32,26 +32,31 @@ def entropy_of_distributions(distributions):
     return entropy(distributions, base=2, axis=axis)
 
 
-def get_entropies(allowed_words, possible_words, weights):
+def get_entropies(allowed_words, possible_words, weights, game_name):
     if weights.sum() == 0:
         return np.zeros(len(allowed_words))
-    distributions = get_pattern_distributions(allowed_words, possible_words, weights)
+    distributions = get_pattern_distributions(
+        allowed_words,
+        possible_words,
+        weights,
+        game_name,
+    )
     return entropy_of_distributions(distributions)
 
 
-def get_bucket_sizes(allowed_words, possible_words):
+def get_bucket_sizes(allowed_words, possible_words, game_name):
     """
     Returns a (len(allowed_words), 243) shape array representing the size of
     word buckets associated with each guess in allowed_words
     """
     weights = np.ones(len(possible_words))
-    return get_pattern_distributions(allowed_words, possible_words, weights)
+    return get_pattern_distributions(allowed_words, possible_words, weights, game_name)
 
 
-def get_bucket_counts(allowed_words, possible_words):
+def get_bucket_counts(allowed_words, possible_words, game_name):
     """
     Returns the number of separate buckets that each guess in allowed_words
     would separate possible_words into
     """
-    bucket_sizes = get_bucket_sizes(allowed_words, possible_words)
+    bucket_sizes = get_bucket_sizes(allowed_words, possible_words, game_name)
     return (bucket_sizes > 0).sum(1)
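For intuition about the quantity these per-game entropies measure, here is a small worked example with made-up numbers (not taken from the commit):

import numpy as np
from scipy.stats import entropy

# If a guess splits the remaining candidates into three pattern buckets
# carrying probability mass [0.5, 0.25, 0.25], the expected information
# from that guess is 0.5*1 + 0.25*2 + 0.25*2 = 1.5 bits.
distribution = np.array([0.5, 0.25, 0.25])
print(entropy(distribution, base=2))  # 1.5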
src/file.py (new file: 44 additions & 0 deletions)
@@ -0,0 +1,44 @@
+import os
+
+DATA_DIR = os.path.join(
+    os.path.dirname(os.path.realpath(__file__)),
+    "data",
+)
+SHORT_WORD_LIST_FILE = "possible_words.txt"
+LONG_WORD_LIST_FILE = "allowed_words.txt"
+WORD_FREQ_FILE = "wordle_words_freq_full.txt"
+WORD_FREQ_MAP_FILE = "freq_map.json"
+PATTERN_MATRIX_FILE = "pattern_matrix.npy"
+SIMULATION_DIR = "simulation_results"
+
+
+def get_data_dir(game_name):
+    return os.path.join(DATA_DIR, game_name)
+
+
+def get_data_fname(game_name, file):
+    return os.path.join(get_data_dir(game_name), file)
+
+
+def get_short_word_list_fname(game_name):
+    return get_data_fname(game_name, SHORT_WORD_LIST_FILE)
+
+
+def get_long_word_list_fname(game_name):
+    return get_data_fname(game_name, LONG_WORD_LIST_FILE)
+
+
+def get_word_freq_fname(game_name):
+    return get_data_fname(game_name, WORD_FREQ_FILE)
+
+
+def get_word_freq_map_fname(game_name):
+    return get_data_fname(game_name, WORD_FREQ_MAP_FILE)
+
+
+def get_pattern_matrix_fname(game_name):
+    return get_data_fname(game_name, PATTERN_MATRIX_FILE)
+
+
+def get_simulation_results_folder(game_name):
+    return get_data_fname(game_name, SIMULATION_DIR)
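Every filename the solver touches is now centralized in this new module: each helper joins a per-game folder with a fixed basename, so the two games can never clash on disk. A usage sketch, assuming the repository layout src/data/<game_name>/:

from src.file import get_data_dir, get_pattern_matrix_fname

print(get_data_dir("dungleon"))              # .../src/data/dungleon
print(get_pattern_matrix_fname("dungleon"))  # .../src/data/dungleon/pattern_matrix.npy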
src/pattern.py (34 changes: 17 additions & 17 deletions)
@@ -4,14 +4,13 @@
 
 import numpy as np
 
-from src.prior import DATA_DIR, get_word_list
+from src.file import get_pattern_matrix_fname
+from src.prior import get_word_list
 
 MISS = np.uint8(0)
 MISPLACED = np.uint8(1)
 EXACT = np.uint8(2)
 
-PATTERN_MATRIX_FILE = os.path.join(DATA_DIR, "pattern_matrix.npy")
-
 # To store the large grid of patterns at run time
 PATTERN_GRID_DATA = {}
 
@@ -92,17 +91,18 @@ def generate_pattern_matrix(words1, words2):
     return pattern_matrix
 
 
-def generate_full_pattern_matrix():
-    words = get_word_list()
+def generate_full_pattern_matrix(game_name):
+    words = get_word_list(game_name)
     pattern_matrix = generate_pattern_matrix(words, words)
     # Save to file
-    np.save(PATTERN_MATRIX_FILE, pattern_matrix)
+    np.save(get_pattern_matrix_fname(game_name), pattern_matrix)
     return pattern_matrix
 
 
-def get_pattern_matrix(words1, words2):
+def get_pattern_matrix(words1, words2, game_name):
+    pattern_matrix_fname = get_pattern_matrix_fname(game_name)
     if not PATTERN_GRID_DATA:
-        if not os.path.exists(PATTERN_MATRIX_FILE):
+        if not os.path.exists(pattern_matrix_fname):
             logging.info(
                 "\n".join(
                     [
@@ -112,10 +112,10 @@ def get_pattern_matrix(words1, words2):
                     ],
                 ),
             )
-            generate_full_pattern_matrix()
-        PATTERN_GRID_DATA["grid"] = np.load(PATTERN_MATRIX_FILE)
+            generate_full_pattern_matrix(game_name)
+        PATTERN_GRID_DATA["grid"] = np.load(pattern_matrix_fname)
         PATTERN_GRID_DATA["words_to_index"] = dict(
-            zip(get_word_list(), itertools.count()),
+            zip(get_word_list(game_name), itertools.count()),
         )
 
     full_grid = PATTERN_GRID_DATA["grid"]
@@ -126,11 +126,11 @@ def get_pattern_matrix(words1, words2):
     return full_grid[np.ix_(indices1, indices2)]
 
 
-def get_pattern(guess, answer):
+def get_pattern(guess, answer, game_name):
     if PATTERN_GRID_DATA:
         saved_words = PATTERN_GRID_DATA["words_to_index"]
         if guess in saved_words and answer in saved_words:
-            return get_pattern_matrix([guess], [answer])[0, 0]
+            return get_pattern_matrix([guess], [answer], game_name)[0, 0]
     return generate_pattern_matrix([guess], [answer])[0, 0]
 
 
@@ -152,14 +152,14 @@ def patterns_to_string(patterns):
     return "\n".join(map(pattern_to_string, patterns))
 
 
-def get_possible_words(guess, pattern, word_list):
-    all_patterns = get_pattern_matrix([guess], word_list).flatten()
+def get_possible_words(guess, pattern, word_list, game_name):
+    all_patterns = get_pattern_matrix([guess], word_list, game_name).flatten()
     return list(np.array(word_list)[all_patterns == pattern])
 
 
-def get_word_buckets(guess, possible_words):
+def get_word_buckets(guess, possible_words, game_name):
     buckets = [[] for _x in range(3**5)]
-    hashes = get_pattern_matrix([guess], possible_words).flatten()
+    hashes = get_pattern_matrix([guess], possible_words, game_name).flatten()
     for index, word in zip(hashes, possible_words):
         buckets[index].append(word)
     return buckets
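With game_name threaded through, the pattern matrix is generated and cached per game on first use. A hedged usage sketch follows (it assumes "crane", "crate" and "slate" appear in the wordle word lists). Note that PATTERN_GRID_DATA is a module-level cache filled only on the first call, so a single process that mixed game_name values would apparently keep reusing the first game's grid:

from src.pattern import get_pattern, get_possible_words

# The first call triggers generate_full_pattern_matrix("wordle") and saves
# data/wordle/pattern_matrix.npy; later calls reuse the in-memory grid.
pattern = get_pattern("crane", "crate", "wordle")  # an integer in [0, 3**5)
remaining = get_possible_words("crane", pattern, ["crate", "slate"], "wordle")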
src/prior.py (42 changes: 22 additions & 20 deletions)
@@ -4,46 +4,48 @@
 import numpy as np
 from scipy.special import expit as sigmoid
 
-DATA_DIR = os.path.join(
-    os.path.dirname(os.path.realpath(__file__)),
-    "data",
+from src.file import (
+    get_long_word_list_fname,
+    get_short_word_list_fname,
+    get_word_freq_fname,
+    get_word_freq_map_fname,
 )
-SHORT_WORD_LIST_FILE = os.path.join(DATA_DIR, "possible_words.txt")
-LONG_WORD_LIST_FILE = os.path.join(DATA_DIR, "allowed_words.txt")
-WORD_FREQ_FILE = os.path.join(DATA_DIR, "wordle_words_freq_full.txt")
-WORD_FREQ_MAP_FILE = os.path.join(DATA_DIR, "freq_map.json")
 
 
 # Reading from files
 
 
-def get_word_list(short=False):
+def get_word_list(game_name, short=False):
     result = []
-    file = SHORT_WORD_LIST_FILE if short else LONG_WORD_LIST_FILE
+    file = (
+        get_short_word_list_fname(game_name)
+        if short
+        else get_long_word_list_fname(game_name)
+    )
     with open(file, encoding="utf8") as fp:
         result.extend([word.strip() for word in fp.readlines()])
     return result
 
 
-def get_word_frequencies(regenerate=False):
-    if os.path.exists(WORD_FREQ_MAP_FILE) or regenerate:
-        with open(WORD_FREQ_MAP_FILE, encoding="utf8") as fp:
+def get_word_frequencies(game_name, regenerate=False):
+    word_freq_map_fname = get_word_freq_map_fname(game_name)
+    if os.path.exists(word_freq_map_fname) or regenerate:
+        with open(word_freq_map_fname, encoding="utf8") as fp:
             result = json.load(fp)
         return result
     # Otherwise, regenerate
     freq_map = {}
-    with open(WORD_FREQ_FILE, encoding="utf8") as fp:
+    with open(get_word_freq_fname(game_name), encoding="utf8") as fp:
         for line in fp.readlines():
             pieces = line.split(" ")
             word = pieces[0]
             freq = [float(piece.strip()) for piece in pieces[1:]]
             freq_map[word] = np.mean(freq[-5:])
-    with open(WORD_FREQ_MAP_FILE, "w", encoding="utf8") as fp:
+    with open(word_freq_map_fname, "w", encoding="utf8") as fp:
         json.dump(freq_map, fp)
     return freq_map
 
 
-def get_frequency_based_priors(n_common=3000, width_under_sigmoid=10):
+def get_frequency_based_priors(game_name, n_common=3000, width_under_sigmoid=10):
     """
     We know that that list of wordle answers was curated by some human
     based on whether they're sufficiently common. This function aims
@@ -52,7 +54,7 @@ def get_frequency_based_priors(n_common=3000, width_under_sigmoid=10):
     Sort the words by frequency, then apply a sigmoid along it.
     """
-    freq_map = get_word_frequencies()
+    freq_map = get_word_frequencies(game_name)
     words = np.array(list(freq_map.keys()))
     freq = np.array([freq_map[w] for w in words])
     arg_sort = freq.argsort()
@@ -70,7 +72,7 @@ def get_frequency_based_priors(n_common=3000, width_under_sigmoid=10):
     return priors
 
 
-def get_true_wordle_prior():
-    words = get_word_list()
-    short_words = get_word_list(short=True)
+def get_true_wordle_prior(game_name):
+    words = get_word_list(game_name)
+    short_words = get_word_list(game_name, short=True)
     return {w: int(w in short_words) for w in words}
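The per-game "true" prior is simply an indicator over the short list. A hedged sketch of what it returns, assuming the standard Wordle word files under data/wordle/:

from src.prior import get_true_wordle_prior

priors = get_true_wordle_prior("wordle")
# priors[w] == 1 if w is in data/wordle/possible_words.txt (a possible answer),
# priors[w] == 0 for words that are allowed as guesses but never answers.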
(The diff for the sixth changed file did not load and is not shown.)