Refactoring: specify data folder at run-time via CLI
woctezuma committed Nov 14, 2022
1 parent f21e2ca commit ae3419e
Showing 6 changed files with 137 additions and 71 deletions.
simulations.py (27 changes: 13 additions & 14 deletions)
@@ -6,18 +6,14 @@
 import numpy as np
 from tqdm import tqdm
 
+from src.file import get_simulation_results_folder
 from src.pattern import (
     get_pattern,
     get_possible_words,
     pattern_to_int_list,
     patterns_to_string,
 )
-from src.prior import (
-    DATA_DIR,
-    get_frequency_based_priors,
-    get_true_wordle_prior,
-    get_word_list,
-)
+from src.prior import get_frequency_based_priors, get_true_wordle_prior, get_word_list
 from src.solver import brute_force_optimal_guess, optimal_guess
 
 GAME_NAMES = ["wordle", "dungleon"]
@@ -43,21 +39,22 @@ def simulate_games(
     next_guess_map_file=None,
     quiet=False,
 ):
-    all_words = get_word_list(short=False)
-    short_word_list = get_word_list(short=True)
+    all_words = get_word_list(game_name, short=False)
+    short_word_list = get_word_list(game_name, short=True)
 
     if first_guess is None:
         first_guess = optimal_guess(
             all_words,
             all_words,
             priors,
+            game_name=game_name,
             look_two_ahead=look_two_ahead,
             purely_maximize_information=purely_maximize_information,
             optimize_for_uniform_distribution=optimize_for_uniform_distribution,
         )
 
     if priors is None:
-        priors = get_frequency_based_priors()
+        priors = get_frequency_based_priors(game_name)
 
     if test_set is None:
         test_set = short_word_list
@@ -82,19 +79,21 @@ def get_next_guess(guesses, patterns, possibilities):
         choices = all_words
         if hard_mode:
             for guess, pattern in zip(guesses, patterns):
-                choices = get_possible_words(guess, pattern, choices)
+                choices = get_possible_words(guess, pattern, choices, game_name)
         if brute_force_optimize:
             next_guess_map[phash] = brute_force_optimal_guess(
                 choices,
                 possibilities,
                 priors,
+                game_name=game_name,
                 n_top_picks=brute_force_depth,
             )
         else:
             next_guess_map[phash] = optimal_guess(
                 choices,
                 possibilities,
                 priors,
+                game_name,
                 look_two_ahead=look_two_ahead,
                 purely_maximize_information=purely_maximize_information,
                 optimize_for_uniform_distribution=optimize_for_uniform_distribution,
@@ -123,10 +122,10 @@ def get_next_guess(guesses, patterns, possibilities):
         score = 1
         guess = first_guess
         while guess != answer:
-            pattern = get_pattern(guess, answer)
+            pattern = get_pattern(guess, answer, game_name)
             guesses.append(guess)
             patterns.append(pattern)
-            possibilities = get_possible_words(guess, pattern, possibilities)
+            possibilities = get_possible_words(guess, pattern, possibilities, game_name)
             possibility_counts.append(len(possibilities))
             score += 1
             guess = get_next_guess(guesses, patterns, possibilities)
@@ -187,7 +186,7 @@ def get_next_guess(guesses, patterns, possibilities):
         (next_guess_map, next_guess_map_file),
     ):
         if file:
-            path = os.path.join(DATA_DIR, "simulation_results", file)
+            path = os.path.join(get_simulation_results_folder(game_name), file)
             with open(path, "w", encoding="utf8") as fp:
                 json.dump(obj, fp)
 
@@ -214,7 +213,7 @@ def get_next_guess(guesses, patterns, possibilities):
    results, decision_map = simulate_games(
        game_name=args.game_name,
        first_guess=args.first_guess,
-       priors=get_true_wordle_prior(),
+       priors=get_true_wordle_prior(args.game_name),
        optimize_for_uniform_distribution=True,
        # shuffle=True,
        # brute_force_optimize=True,
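The final hunk reads args.game_name and args.first_guess, but the argument parser itself falls outside this diff. For reference, a minimal sketch of the CLI plumbing it implies; the flag names and defaults here are assumptions, not part of the commit:

# Hypothetical sketch of the parser behind args.game_name / args.first_guess;
# the real parser is not shown in this diff.
import argparse

GAME_NAMES = ["wordle", "dungleon"]  # as defined at the top of simulations.py

parser = argparse.ArgumentParser()
parser.add_argument("--game_name", choices=GAME_NAMES, default="wordle")
parser.add_argument("--first_guess", default=None)
args = parser.parse_args()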
src/entropy.py (21 changes: 13 additions & 8 deletions)
@@ -6,7 +6,7 @@
 # Functions associated with entropy calculation
 
 
-def get_pattern_distributions(allowed_words, possible_words, weights):
+def get_pattern_distributions(allowed_words, possible_words, weights, game_name):
     """
     For each possible guess in allowed_words, this finds the probability
     distribution across all the 3^5 wordle patterns you could see, assuming
@@ -17,7 +17,7 @@ def get_pattern_distributions(allowed_words, possible_words, weights):
     that to bucket together words from possible_words which would produce
     the same pattern, adding together their corresponding probabilities.
     """
-    pattern_matrix = get_pattern_matrix(allowed_words, possible_words)
+    pattern_matrix = get_pattern_matrix(allowed_words, possible_words, game_name)
 
     n = len(allowed_words)
     distributions = np.zeros((n, 3**5))
@@ -32,26 +32,31 @@ def entropy_of_distributions(distributions):
     return entropy(distributions, base=2, axis=axis)
 
 
-def get_entropies(allowed_words, possible_words, weights):
+def get_entropies(allowed_words, possible_words, weights, game_name):
     if weights.sum() == 0:
         return np.zeros(len(allowed_words))
-    distributions = get_pattern_distributions(allowed_words, possible_words, weights)
+    distributions = get_pattern_distributions(
+        allowed_words,
+        possible_words,
+        weights,
+        game_name,
+    )
     return entropy_of_distributions(distributions)
 
 
-def get_bucket_sizes(allowed_words, possible_words):
+def get_bucket_sizes(allowed_words, possible_words, game_name):
     """
     Returns a (len(allowed_words), 243) shape array representing the size of
     word buckets associated with each guess in allowed_words
     """
     weights = np.ones(len(possible_words))
-    return get_pattern_distributions(allowed_words, possible_words, weights)
+    return get_pattern_distributions(allowed_words, possible_words, weights, game_name)
 
 
-def get_bucket_counts(allowed_words, possible_words):
+def get_bucket_counts(allowed_words, possible_words, game_name):
     """
     Returns the number of separate buckets that each guess in allowed_words
     would separate possible_words into
     """
-    bucket_sizes = get_bucket_sizes(allowed_words, possible_words)
+    bucket_sizes = get_bucket_sizes(allowed_words, possible_words, game_name)
     return (bucket_sizes > 0).sum(1)
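For intuition about the quantity these per-game entropies measure, here is a small worked example with made-up numbers (not taken from the commit):

import numpy as np
from scipy.stats import entropy

# If a guess splits the remaining candidates into three pattern buckets
# carrying probability mass [0.5, 0.25, 0.25], the expected information
# from that guess is 0.5*1 + 0.25*2 + 0.25*2 = 1.5 bits.
distribution = np.array([0.5, 0.25, 0.25])
print(entropy(distribution, base=2))  # 1.5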
src/file.py (new file: 44 additions & 0 deletions)
@@ -0,0 +1,44 @@
+import os
+
+DATA_DIR = os.path.join(
+    os.path.dirname(os.path.realpath(__file__)),
+    "data",
+)
+SHORT_WORD_LIST_FILE = "possible_words.txt"
+LONG_WORD_LIST_FILE = "allowed_words.txt"
+WORD_FREQ_FILE = "wordle_words_freq_full.txt"
+WORD_FREQ_MAP_FILE = "freq_map.json"
+PATTERN_MATRIX_FILE = "pattern_matrix.npy"
+SIMULATION_DIR = "simulation_results"
+
+
+def get_data_dir(game_name):
+    return os.path.join(DATA_DIR, game_name)
+
+
+def get_data_fname(game_name, file):
+    return os.path.join(get_data_dir(game_name), file)
+
+
+def get_short_word_list_fname(game_name):
+    return get_data_fname(game_name, SHORT_WORD_LIST_FILE)
+
+
+def get_long_word_list_fname(game_name):
+    return get_data_fname(game_name, LONG_WORD_LIST_FILE)
+
+
+def get_word_freq_fname(game_name):
+    return get_data_fname(game_name, WORD_FREQ_FILE)
+
+
+def get_word_freq_map_fname(game_name):
+    return get_data_fname(game_name, WORD_FREQ_MAP_FILE)
+
+
+def get_pattern_matrix_fname(game_name):
+    return get_data_fname(game_name, PATTERN_MATRIX_FILE)
+
+
+def get_simulation_results_folder(game_name):
+    return get_data_fname(game_name, SIMULATION_DIR)
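Every filename the solver touches is now centralized in this new module: each helper joins a per-game folder with a fixed basename, so the two games can never clash on disk. A usage sketch, assuming the repository layout src/data/<game_name>/:

from src.file import get_data_dir, get_pattern_matrix_fname

print(get_data_dir("dungleon"))              # .../src/data/dungleon
print(get_pattern_matrix_fname("dungleon"))  # .../src/data/dungleon/pattern_matrix.npy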
src/pattern.py (34 changes: 17 additions & 17 deletions)
@@ -4,14 +4,13 @@
 
 import numpy as np
 
-from src.prior import DATA_DIR, get_word_list
+from src.file import get_pattern_matrix_fname
+from src.prior import get_word_list
 
 MISS = np.uint8(0)
 MISPLACED = np.uint8(1)
 EXACT = np.uint8(2)
 
-PATTERN_MATRIX_FILE = os.path.join(DATA_DIR, "pattern_matrix.npy")
-
 # To store the large grid of patterns at run time
 PATTERN_GRID_DATA = {}
 
@@ -92,17 +91,18 @@ def generate_pattern_matrix(words1, words2):
     return pattern_matrix
 
 
-def generate_full_pattern_matrix():
-    words = get_word_list()
+def generate_full_pattern_matrix(game_name):
+    words = get_word_list(game_name)
     pattern_matrix = generate_pattern_matrix(words, words)
     # Save to file
-    np.save(PATTERN_MATRIX_FILE, pattern_matrix)
+    np.save(get_pattern_matrix_fname(game_name), pattern_matrix)
     return pattern_matrix
 
 
-def get_pattern_matrix(words1, words2):
+def get_pattern_matrix(words1, words2, game_name):
+    pattern_matrix_fname = get_pattern_matrix_fname(game_name)
     if not PATTERN_GRID_DATA:
-        if not os.path.exists(PATTERN_MATRIX_FILE):
+        if not os.path.exists(pattern_matrix_fname):
             logging.info(
                 "\n".join(
                     [
@@ -112,10 +112,10 @@ def get_pattern_matrix(words1, words2):
                     ],
                 ),
             )
-            generate_full_pattern_matrix()
-        PATTERN_GRID_DATA["grid"] = np.load(PATTERN_MATRIX_FILE)
+            generate_full_pattern_matrix(game_name)
+        PATTERN_GRID_DATA["grid"] = np.load(pattern_matrix_fname)
         PATTERN_GRID_DATA["words_to_index"] = dict(
-            zip(get_word_list(), itertools.count()),
+            zip(get_word_list(game_name), itertools.count()),
         )
 
     full_grid = PATTERN_GRID_DATA["grid"]
@@ -126,11 +126,11 @@ def get_pattern_matrix(words1, words2):
     return full_grid[np.ix_(indices1, indices2)]
 
 
-def get_pattern(guess, answer):
+def get_pattern(guess, answer, game_name):
     if PATTERN_GRID_DATA:
         saved_words = PATTERN_GRID_DATA["words_to_index"]
         if guess in saved_words and answer in saved_words:
-            return get_pattern_matrix([guess], [answer])[0, 0]
+            return get_pattern_matrix([guess], [answer], game_name)[0, 0]
     return generate_pattern_matrix([guess], [answer])[0, 0]
 
 
@@ -152,14 +152,14 @@ def patterns_to_string(patterns):
     return "\n".join(map(pattern_to_string, patterns))
 
 
-def get_possible_words(guess, pattern, word_list):
-    all_patterns = get_pattern_matrix([guess], word_list).flatten()
+def get_possible_words(guess, pattern, word_list, game_name):
+    all_patterns = get_pattern_matrix([guess], word_list, game_name).flatten()
     return list(np.array(word_list)[all_patterns == pattern])
 
 
-def get_word_buckets(guess, possible_words):
+def get_word_buckets(guess, possible_words, game_name):
     buckets = [[] for _x in range(3**5)]
-    hashes = get_pattern_matrix([guess], possible_words).flatten()
+    hashes = get_pattern_matrix([guess], possible_words, game_name).flatten()
     for index, word in zip(hashes, possible_words):
         buckets[index].append(word)
     return buckets
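With game_name threaded through, the pattern matrix is generated and cached per game on first use. A hedged usage sketch follows (it assumes "crane", "crate" and "slate" appear in the wordle word lists). Note that PATTERN_GRID_DATA is a module-level cache filled only on the first call, so a single process that mixed game_name values would apparently keep reusing the first game's grid:

from src.pattern import get_pattern, get_possible_words

# The first call triggers generate_full_pattern_matrix("wordle") and saves
# data/wordle/pattern_matrix.npy; later calls reuse the in-memory grid.
pattern = get_pattern("crane", "crate", "wordle")  # an integer in [0, 3**5)
remaining = get_possible_words("crane", pattern, ["crate", "slate"], "wordle")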
src/prior.py (42 changes: 22 additions & 20 deletions)
@@ -4,46 +4,48 @@
 import numpy as np
 from scipy.special import expit as sigmoid
 
-DATA_DIR = os.path.join(
-    os.path.dirname(os.path.realpath(__file__)),
-    "data",
+from src.file import (
+    get_long_word_list_fname,
+    get_short_word_list_fname,
+    get_word_freq_fname,
+    get_word_freq_map_fname,
 )
-SHORT_WORD_LIST_FILE = os.path.join(DATA_DIR, "possible_words.txt")
-LONG_WORD_LIST_FILE = os.path.join(DATA_DIR, "allowed_words.txt")
-WORD_FREQ_FILE = os.path.join(DATA_DIR, "wordle_words_freq_full.txt")
-WORD_FREQ_MAP_FILE = os.path.join(DATA_DIR, "freq_map.json")
 
 
 # Reading from files
 
 
-def get_word_list(short=False):
+def get_word_list(game_name, short=False):
     result = []
-    file = SHORT_WORD_LIST_FILE if short else LONG_WORD_LIST_FILE
+    file = (
+        get_short_word_list_fname(game_name)
+        if short
+        else get_long_word_list_fname(game_name)
+    )
     with open(file, encoding="utf8") as fp:
         result.extend([word.strip() for word in fp.readlines()])
     return result
 
 
-def get_word_frequencies(regenerate=False):
-    if os.path.exists(WORD_FREQ_MAP_FILE) or regenerate:
-        with open(WORD_FREQ_MAP_FILE, encoding="utf8") as fp:
+def get_word_frequencies(game_name, regenerate=False):
+    word_freq_map_fname = get_word_freq_map_fname(game_name)
+    if os.path.exists(word_freq_map_fname) or regenerate:
+        with open(word_freq_map_fname, encoding="utf8") as fp:
             result = json.load(fp)
         return result
     # Otherwise, regenerate
     freq_map = {}
-    with open(WORD_FREQ_FILE, encoding="utf8") as fp:
+    with open(get_word_freq_fname(game_name), encoding="utf8") as fp:
         for line in fp.readlines():
             pieces = line.split(" ")
             word = pieces[0]
             freq = [float(piece.strip()) for piece in pieces[1:]]
             freq_map[word] = np.mean(freq[-5:])
-    with open(WORD_FREQ_MAP_FILE, "w", encoding="utf8") as fp:
+    with open(word_freq_map_fname, "w", encoding="utf8") as fp:
         json.dump(freq_map, fp)
     return freq_map
 
 
-def get_frequency_based_priors(n_common=3000, width_under_sigmoid=10):
+def get_frequency_based_priors(game_name, n_common=3000, width_under_sigmoid=10):
     """
     We know that that list of wordle answers was curated by some human
     based on whether they're sufficiently common. This function aims
@@ -52,7 +54,7 @@ def get_frequency_based_priors(n_common=3000, width_under_sigmoid=10):
     Sort the words by frequency, then apply a sigmoid along it.
     """
-    freq_map = get_word_frequencies()
+    freq_map = get_word_frequencies(game_name)
     words = np.array(list(freq_map.keys()))
     freq = np.array([freq_map[w] for w in words])
     arg_sort = freq.argsort()
@@ -70,7 +72,7 @@ def get_frequency_based_priors(n_common=3000, width_under_sigmoid=10):
     return priors
 
 
-def get_true_wordle_prior():
-    words = get_word_list()
-    short_words = get_word_list(short=True)
+def get_true_wordle_prior(game_name):
+    words = get_word_list(game_name)
+    short_words = get_word_list(game_name, short=True)
     return {w: int(w in short_words) for w in words}
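The per-game "true" prior is simply an indicator over the short list. A hedged sketch of what it returns, assuming the standard Wordle word files under data/wordle/:

from src.prior import get_true_wordle_prior

priors = get_true_wordle_prior("wordle")
# priors[w] == 1 if w is in data/wordle/possible_words.txt (a possible answer),
# priors[w] == 0 for words that are allowed as guesses but never answers.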
(The diff for the sixth changed file did not load and is not shown.)