-
Notifications
You must be signed in to change notification settings - Fork 1
/
hangman.py
198 lines (166 loc) · 6.9 KB
/
hangman.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
from collections import defaultdict, namedtuple
import csv
import jinja2
import logging
import math
import os
import random
logging.basicConfig(level=logging.DEBUG)
sowpods = set(word.decode('utf-8').strip().lower() for word in open('sowpods.txt'))
word_ranks = defaultdict(int)
culled = 0
for word, count in csv.reader(open('50knouns.txt'), delimiter='\t'):
if word not in sowpods:
culled += 1
else:
word_ranks[word.decode('utf-8').strip().lower()] += int(count)
logging.info("Culled %d nonwords", culled)
words = [word for word, count in sorted(word_ranks.iteritems(), key=lambda (w,c): c, reverse=True)]
alphabet = set("abcdefghijklmnopqrstuvwxyz")
def separate_by_length(words):
words_by_length = defaultdict(list)
for word in words:
words_by_length[len(word)].append(word)
return words_by_length
def make_pattern(word, ch):
return u''.join(word[i] if word[i] == ch or word[i] < 'a' or word[i] > 'z' else u'_' for i in range(len(word)))
def combine_patterns(first, second):
return u''.join(b if a == u'_' else a for a, b in zip(first, second))
def group_by_patterns(words, ch):
groups = defaultdict(list)
for word in words:
groups[make_pattern(word, ch)].append(word)
return groups
Node = namedtuple('Node', ['guess', 'value'])
Entry = namedtuple('Entry', ['pattern', 'is_leaf', 'value'])
Section = namedtuple('Section', ['id', 'pattern', 'wrong', 'guess', 'entries'])
def empty_pattern(pattern):
return u'_' * len(pattern)
def score_grouping(subgroups, pattern):
wrong_words = subgroups.get(empty_pattern(pattern), ())
return (
len(subgroups) == 1, # Avoid meaningless guesses
len(wrong_words), # Minimise number of wrong words
sum(word_ranks[word] for word in wrong_words), # Avoid popular wrong words
-len(subgroups), # Maximise branching factor
)
def build_graph(pattern, words, letters, wrong=0):
if wrong == 6:
return Node('', None), words
elif len(words) == 1:
return Node('', words[0]), []
best_guess = None
for ch in letters:
subgroups = group_by_patterns(words, ch)
guess = (
score_grouping(subgroups, pattern),
ch,
subgroups)
if not best_guess or guess < best_guess:
best_guess = guess
score, ch, subgroups = best_guess
subnodes = {}
total_pruned = []
for subpattern, subwords in subgroups.iteritems():
new_pattern = combine_patterns(pattern, subpattern)
subnodes[new_pattern], pruned = build_graph(
new_pattern,
subwords,
letters - set([ch]),
wrong + 1 if new_pattern == pattern else wrong)
total_pruned.extend(pruned)
# Due to pruning, occasionally you end up with a subtree
# with only one usable word in it. Lift it back up.
if len(subnodes) == 2:
(pattern1, node1), (pattern2, node2) = subnodes.items()
if pattern1 == pattern and node1.value is None:
return node2, total_pruned
if pattern2 == pattern and node2.value is None:
return node1, total_pruned
return Node(ch, subnodes), total_pruned
def build_complete_graph(words):
graph = {}
total_pruned = []
for length, words in separate_by_length(words).iteritems():
pattern = u'_' * length
graph[pattern], pruned = build_graph(pattern, words, alphabet)
total_pruned.extend(pruned)
return graph, total_pruned
def augment_graph(graph, words):
"""Adds any words to the graph that can be added without inserting new sections."""
added = []
for word in words:
pattern = empty_pattern(word)
node = graph[pattern]
while node.guess != '':
pattern = combine_patterns(pattern, make_pattern(word, node.guess))
if pattern not in node.value:
# We can add this word without creating a new section
node.value[pattern] = Node('', word)
added.append(word)
break
node = node.value[pattern]
return added
def get_entry_value(graph, node_map, subnode_key):
if not subnode_key:
return True, None
elif graph[subnode_key].guess == '':
if not graph[subnode_key].value:
# Player wins
return True, None
else:
# We make a guess
return True, graph[subnode_key].value[0]
else:
return False, node_map[subnode_key]
def produce_book_data(pattern, node, start_id=1, wrong=0):
entries = []
subsections = []
for subpattern, subnode in sorted(node.value.iteritems(), key=lambda (k, v): (k!=pattern, k)):
if subnode.guess == '':
entries.append(Entry(subpattern, True, subnode.value))
else:
subsection_id = start_id + len(subsections) + 1
entries.append(Entry(subpattern, False, subsection_id))
if pattern == subpattern:
subsections.extend(produce_book_data(subpattern, subnode, subsection_id, wrong + 1))
else:
subsections.extend(produce_book_data(subpattern, subnode, subsection_id, wrong))
return [Section(start_id, pattern, wrong, node.guess, entries)] + subsections
def produce_complete_book_data(graph):
sections = []
lengths = []
for pattern, node in sorted(graph.iteritems()):
section_id = len(sections) + 1
lengths.append((len(pattern), section_id))
sections.extend(produce_book_data(pattern, node, section_id))
return sections, lengths
def write_wordlist(inwords, pruned, added):
words = set(inwords).difference(pruned).union(added)
out = open('hangman_wordlist.txt', 'w')
for word in sorted(words):
out.write(word + "\n")
out.close()
def book(count, extended_count):
env = jinja2.Environment(loader=jinja2.FileSystemLoader(os.getcwd()))
template = env.get_template("book.html")
inwords = words[:count]
graph, pruned = build_complete_graph(words[:count])
if pruned:
logging.warn("Pruned %d words due to too many wrong guesses: %r", len(pruned), pruned)
logging.warn("Pruned words are worth: %s", sum(word_ranks[word] for word in pruned))
added = augment_graph(graph, words[count:extended_count + count])
if added:
added.sort(key=lambda w:(len(w), w))
logging.info("Added %d words to existing sections", len(added))
write_wordlist(inwords, pruned, added)
sections, lengths = produce_complete_book_data(graph)
logging.info("Created %d sections", len(sections))
logging.info("lengths: %s", [(a, b, b*100.0/len(sections)) for (a,b) in lengths])
bookdata = template.render({
'lengths': lengths,
'sections': sections,
}).encode('utf-8')
print bookdata
if __name__ == '__main__':
book(4000, 12950)