-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate.py
executable file
·115 lines (87 loc) · 2.93 KB
/
generate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/python
import sys, string, re, pprint, random
# rewrite_node() calls itself once per level of the derivation tree, so the
# recursion limit is raised to leave room for deep derivations.
sys.setrecursionlimit(20000)
# Grammar tables, filled in by parse_grammar_file():
#   rules:         LHS -> [(RHS, weight), ...] where RHS is a list of symbols
#   total_weights: LHS -> sum of the weights of all rules for that LHS
rules = {}
total_weights = {}


def parse_grammar_file(filename):
    """Load a weighted grammar from *filename* into ``rules``/``total_weights``.

    Every rule line consists of whitespace-separated fields::

        weight LHS RHS-symbol [RHS-symbol ...]

    Anything from a ``#`` to the end of the line is a comment. Blank lines
    and comment-only lines (indented or not) are skipped. A left-hand side
    may appear on several lines; each line adds one weighted alternative.
    The right-hand side may be empty.

    Raises:
        ValueError: if a rule line has no LHS or its weight is not a number.
        IOError/OSError: if the file cannot be opened.
    """
    # ``with`` guarantees the file is closed even if a line fails to parse.
    with open(filename, 'r') as grammar_file:
        for line_number, raw_line in enumerate(grammar_file, 1):
            # Strip the comment first so that comment-only and blank lines
            # both collapse to an empty field list.
            fields = raw_line.split("#", 1)[0].split()
            if not fields:
                continue
            if len(fields) < 2:
                raise ValueError(
                    "%s:%d: expected 'weight LHS [RHS ...]', got %r"
                    % (filename, line_number, raw_line.strip()))
            weight = float(fields[0])
            LHS = fields[1]
            RHS = fields[2:]
            if LHS in rules:
                rules[LHS].append((RHS, weight))
                total_weights[LHS] += weight
            else:
                rules[LHS] = [(RHS, weight)]
                total_weights[LHS] = weight
def create_sentence():
    """Generate one random sentence as a flat list of tokens.

    The "ROOT" symbol is expanded with rewrite_node() and the nested
    result is linearised with flatten().
    """
    return flatten(rewrite_node("ROOT"))
def rewrite_node(node):
    """Recursively expand a single grammar symbol.

    A symbol that has an entry in ``rules`` is a nonterminal: one of its
    right-hand sides is drawn with choose_probabilistic() and every symbol
    of that right-hand side is expanded in turn. Any other symbol is a
    terminal and is returned unchanged.

    Returns:
        The terminal itself, or a (possibly nested) list holding the
        expansions of the chosen right-hand side.
    """
    # Direct dict membership is a hash lookup; ``rules.keys()`` would build
    # and scan a list on every call under Python 2.
    if node not in rules:
        return node  # terminal
    rewrite = choose_probabilistic(rules[node])
    # A list comprehension always yields a real list (``map`` is lazy on
    # Python 3), which is what flatten() checks for when it descends.
    return [rewrite_node(symbol) for symbol in rewrite]
# Originally adapted from http://caolanmcmahon.com/posts/flatten_for_python/
def flatten(l):
    """Turn an arbitrarily nested list into a single flat list.

    Only ``list`` instances are descended into; every other element
    (strings, tuples, numbers, ...) is appended to the result as-is, in
    left-to-right order. The input is not modified.

    The traversal keeps an explicit stack of iterators instead of
    recursing, so nesting depth is not bounded by the interpreter's
    recursion limit, and elements are appended in place rather than by
    repeatedly concatenating lists.
    """
    flat = []
    pending = [iter(l)]
    while pending:
        for item in pending[-1]:
            if isinstance(item, list):
                # Descend; the partially consumed parent iterator stays on
                # the stack and resumes where it stopped.
                pending.append(iter(item))
                break
            flat.append(item)
        else:
            # Innermost iterator exhausted: go back up one level.
            pending.pop()
    return flat
def choose_probabilistic(list):
    """Draw one item from *list*, a sequence of (item, weight) entries.

    The weights are handed to get_random_weighted_index() and the item of
    the entry at the returned index is the result.
    """
    weights = []
    for entry in list:
        weights.append(entry[1])
    picked = list[get_random_weighted_index(weights)]
    return picked[0]
# http://eli.thegreenplace.net/2010/01/22/weighted-random-generation-in-python/
def get_random_weighted_index(weights):
    """Return a random index into *weights*, chosen with probability
    ``weights[i] / sum(weights)``.

    Args:
        weights: iterable of numeric weights.

    Returns:
        The first index whose cumulative weight exceeds a random point
        drawn from ``[0, total)``.

    Raises:
        ValueError: if *weights* is empty or its total is not positive.
            Without this check the function would fall off the end and
            return None, which the caller would then use as an index.
    """
    totals = []
    running_total = 0
    for w in weights:
        running_total += w
        totals.append(running_total)
    # ``not x > 0`` also rejects NaN totals.
    if not running_total > 0:
        raise ValueError(
            "weights must be non-empty with a positive total, got total %r"
            % (running_total,))
    rnd = random.random() * running_total
    for i, total in enumerate(totals):
        if rnd < total:
            return i
############
## main
def main(argv):
    """Print randomly generated sentences for a grammar file.

    Args:
        argv: command-line vector; argv[1] is the grammar file name and
            argv[2] the number of sentences to generate. Extra arguments
            are ignored.

    Returns:
        Process exit status: 0 on success, 2 when arguments are missing.

    Each sentence is printed on its own line with its tokens separated by
    single spaces.
    """
    if len(argv) < 3:
        sys.stderr.write(
            "usage: %s GRAMMAR_FILE NUMBER_OF_SENTENCES\n" % argv[0])
        return 2
    grammar_filename = argv[1]
    number_of_sentences = int(argv[2])
    parse_grammar_file(grammar_filename)
    for _ in range(number_of_sentences):
        # A single parenthesised argument prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print(" ".join(create_sentence()))
    return 0


# Only run when executed as a script, so the module can be imported
# without consuming sys.argv.
if __name__ == "__main__":
    sys.exit(main(sys.argv))