-
Notifications
You must be signed in to change notification settings - Fork 0
/
HMMTagger.py
114 lines (94 loc) · 3.18 KB
/
HMMTagger.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import os
import json
# Directory that contains this module; the trained tables are read from the
# "TrainedData" folder next to it.
cur_path = os.path.dirname(__file__)

# Per-language lookup tables loaded from JSON at import time.
#   wordTag[lang][word][tag]       -> value used as the emission score
#   wordTagSeq[lang][prevTag][tag] -> value used as the transition score
wordTag = {}
wordTagSeq = {}
langlist = ["En", "Hi", "EnDetailed"]
for lang in langlist:
    # JSON text is UTF-8; decode it explicitly so loading does not depend on
    # the platform's locale encoding (which would break on non-ASCII words).
    filename = "WordTag" + lang + ".json"
    new_path = os.path.join(cur_path, "TrainedData", filename)
    with open(new_path, "r", encoding="utf-8") as file:
        wordTag[lang] = json.load(file)

    filename = "WordTagSeq" + lang + ".json"
    new_path = os.path.join(cur_path, "TrainedData", filename)
    with open(new_path, "r", encoding="utf-8") as file:
        wordTagSeq[lang] = json.load(file)
# Get the emission and transition probability
def getProb(word, prevTag, curTag, langcode):
    """Look up the emission and transition scores for one tagging step.

    Args:
        word: Token whose emission score is wanted.
        prevTag: Tag at the previous position.
        curTag: Candidate tag for ``word``.
        langcode: Key selecting the language tables inside the module-level
            ``wordTag`` and ``wordTagSeq`` dictionaries.

    Returns:
        A tuple ``(EmProb, TrProb)``:

        * ``EmProb`` is ``wordTag[langcode][word][curTag]`` when present,
          ``1`` when the word is not in the language's table at all, and
          ``0`` when the word is known but never recorded with ``curTag``.
        * ``TrProb`` is ``wordTagSeq[langcode][prevTag][curTag]`` when
          present and ``1`` otherwise.

    Raises:
        KeyError: If ``langcode`` is not a loaded language.
    """
    # Transition score. A previous tag with no row at all is treated the same
    # way as a row that lacks curTag: the existing fallback value of 1.
    transitions = wordTagSeq[langcode].get(prevTag, {})
    TrProb = transitions.get(curTag, 1)

    # Emission score. The table is keyed by language first, then by word;
    # looking the word up in the top-level dict would compare it against the
    # language codes and treat almost every word as unknown.
    tagsForWord = wordTag[langcode].get(word)
    if tagsForWord is None:
        EmProb = 1
    else:
        EmProb = tagsForWord.get(curTag, 0)

    return EmProb, TrProb
def viterbiAlgorithm(sent, langcode):
    """Find the highest-scoring tag sequence for ``sent`` with Viterbi search.

    The sentence is split on single spaces (empty pieces are dropped) and a
    start marker word ``"S"`` is inserted at position 0. Scores are products
    of the values returned by ``getProb``.

    Args:
        sent: Sentence to tag, with tokens separated by spaces.
        langcode: Key selecting the language tables in ``wordTag`` and
            ``wordTagSeq``.

    Returns:
        A list of ``(word, tag)`` tuples in REVERSE sentence order: the last
        word comes first and the entry for the ``"S"`` start marker comes
        last. When no tag sequence reaches a score above zero, every entry
        carries the empty tag ``""``.

    Raises:
        KeyError: If ``langcode`` is not a loaded language.
    """
    emissionTable = wordTag[langcode]
    transitionTable = wordTagSeq[langcode]

    # Candidate tags: every tag that has a row in the transition table, once
    # each and in table order. Collecting them with repeated appends produced
    # many duplicates, which multiplied the work of every loop below.
    possibleTags = list(transitionTable)

    words = [word for word in sent.split(" ") if word]
    words.insert(0, "S")

    bestScore = {}  # (word, tag, position) -> best score of a path ending there
    bestEdge = {}   # (word, tag, position) -> previous tag on that best path

    # Position 0 holds the start marker.
    # NOTE(review): the marker is scored under every candidate tag rather than
    # being pinned to the "S" tag; kept as-is to preserve existing results.
    for tag in possibleTags:
        EmProb, TrProb = getProb(words[0], "S", tag, langcode)
        bestScore[(words[0], tag, 0)] = EmProb * TrProb
        bestEdge[(words[0], tag, 0)] = "S"

    for i in range(1, len(words)):
        word = words[i]
        prevWord = words[i - 1]
        # None means the word is absent from the table, so any tag is allowed.
        knownTags = emissionTable.get(word)
        for curTag in possibleTags:
            topScore = 0
            if knownTags is None or curTag in knownTags:
                for prevTag in possibleTags:
                    EmProb, TrProb = getProb(word, prevTag, curTag, langcode)
                    score = bestScore[(prevWord, prevTag, i - 1)] * EmProb * TrProb
                    # Strict comparison: the first best predecessor wins ties.
                    if score > topScore:
                        topScore = score
                        bestEdge[(word, curTag, i)] = prevTag
            # A known word never recorded with curTag keeps the score 0.
            bestScore[(word, curTag, i)] = topScore

    # Pick the best tag for the last word.
    lastIndex = len(words) - 1
    lastWord = words[lastIndex]
    bestTag = None
    topScore = 0
    for tag in possibleTags:
        if bestScore[(lastWord, tag, lastIndex)] > topScore:
            topScore = bestScore[(lastWord, tag, lastIndex)]
            bestTag = tag

    if bestTag is None:
        # Every path scored 0, so there are no back-pointers to follow
        # (following them would raise KeyError). Report each word with the
        # empty tag instead of failing.
        return [(word, "") for word in reversed(words)]

    # Walk the back-pointers from the last word to the start marker.
    taggedSent = [(lastWord, bestTag)]
    for i in range(lastIndex - 1, -1, -1):
        bestTag = bestEdge[(words[i + 1], bestTag, i + 1)]
        taggedSent.append((words[i], bestTag))
    return taggedSent
def tagSentence(sent, langcode):
    """Tag a sentence and return ``(word, tag)`` pairs in sentence order.

    Runs ``viterbiAlgorithm``, puts its reversed output back into reading
    order and drops the first entry (the inserted start marker). Two tags are
    rewritten on the way out: ``"S"`` becomes ``"NNP"`` and the empty tag
    ``""`` becomes ``"X"``; every other pair is passed through unchanged.

    Args:
        sent: Sentence to tag, with tokens separated by spaces.
        langcode: Language key forwarded to ``viterbiAlgorithm``.

    Returns:
        A list of ``(word, tag)`` tuples, one per token of ``sent``.
    """
    inReadingOrder = list(reversed(viterbiAlgorithm(sent, langcode)))

    updatedSent = []
    # Skip index 0: that entry belongs to the start marker, not to a word.
    for pair in inReadingOrder[1:]:
        token, label = pair[0], pair[1]
        if label == "S":
            updatedSent.append((token, "NNP"))
            continue
        if label == "":
            updatedSent.append((token, "X"))
            continue
        updatedSent.append(pair)
    return updatedSent