-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpcyk.py
114 lines (83 loc) · 3.67 KB
/
pcyk.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import sys
from argparse import ArgumentParser
from collections import defaultdict
from tree import Tree
def load_lexicon(filename):
# Platzhalter; hier soll das Lexikon eingelesen werden.
lexicon = defaultdict(list)
with open(filename, 'r') as fl:
for line in fl:
# [word, word\tprob]
split = line.strip().split(" -> ")
left_part = split[0]
split = split[1].split("\t")
right_part = split[0]
prob = split[1]
lexicon[right_part].append((float(prob), left_part))
return lexicon
def load_rules(filename):
rules = defaultdict(lambda: defaultdict(list))
with open(filename, 'r') as fl:
for line in fl:
# [word, word\tprob]
split = line.strip().split(" -> ")
left_terminal = [split[0]][0]
split = split[1].split("\t")
right_terminals = split[0].split()
prob = split[len(split) - 1]
rules[right_terminals[0]][right_terminals[1]].append((float(prob), left_terminal))
return rules
class PCYK():
def __init__(self, rules, lexicon, beam):
self.rules = load_rules(rules)
self.lexicon = load_lexicon(lexicon)
# maximale Anzahl an Konstituenten, die in den Zellen der Chart gespeichert
# werden sollen.
self.beam = beam
def parse(self, tokens):
P = defaultdict(dict) # Wahrscheinlichkeiten
B = defaultdict(dict) # Bäume
for i, wi in enumerate(tokens, start=1):
for prob, lhs in self.lexicon.get(wi, []) or self.lexicon['OOV']: # nimm "OOV" falls wi nicht im Lexikon
P[i - 1, i][lhs] = prob
B[i - 1, i][lhs] = Tree(lhs, [Tree(wi, [])])
for j in range(i - 2, -1, -1):
for k in range(j + 1, i):
for rhs1 in P[j, k]:
for rhs2 in P[k, i]:
for ruleprob, lhs in self.rules[rhs1][rhs2]:
prob = ruleprob + P[j, k][rhs1] + P[k, i][rhs2]
if lhs not in P[j, i] or prob > P[j, i][lhs]:
if len(P[j, i]) > self.beam:
min_key = max(P[j, i], key=P[j, i].get)
if prob < P[j, i][min_key]:
continue
del P[j, i][min_key]
del B[j, i][min_key]
# update Wahrscheinlichkeit
P[j, i][lhs] = prob
# update Pointer / Baum
B[j, i][lhs] = Tree(lhs, [B[j, k][rhs1], B[k, i][rhs2]])
if "S" in P[0, i]:
return (P[0, i]["S"], B[0, i]["S"])
else:
return None
def main(args):
parser = PCYK(args.rules, args.lexicon, args.beam)
with open(args.file) as f:
for tokens in f:
tokens = tokens.split()
if args.maxlen and len(tokens) <= args.maxlen:
result = parser.parse(tokens)
if result:
print(*result, sep='\t')
else:
print('No parse found for "{}"'.format(' '.join(tokens)), file=sys.stderr)
if __name__ == "__main__":
ap = ArgumentParser()
ap.add_argument('--rules', type=str, default='rules.txt')
ap.add_argument('--lexicon', type=str, default='lexicon.txt')
ap.add_argument('--beam', type=int, default=50)
ap.add_argument('--maxlen', type=int, default=25)
ap.add_argument('file')
main(ap.parse_args())