-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmarkov_chain.py
63 lines (54 loc) · 2.13 KB
/
markov_chain.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
from dictogram import Dictogram
from random import randint
class MarkovChain(dict):
    """First-order Markov chain over words.

    The chain is a dict that maps each word to a Dictogram holding the
    words seen immediately after it. Every compiled sentence is wrapped in
    the START and END sentinel tokens so that generation knows where a
    sentence may begin and where it must stop.
    """

    def __init__(self, sentences):
        """Build the chain from an iterable of sentence strings.

        Args:
            sentences: iterable of strings; each one is passed to compile().
        """
        super(MarkovChain, self).__init__()
        # Sentinel tokens marking sentence boundaries inside the chain.
        self.START = "!+-2"
        self.END = "$$$"
        for sentence in sentences:
            self.compile(sentence)

    # First Order
    def compile(self, sentence):
        """Record every word -> next-word transition of one sentence.

        The sentence is split on single spaces and surrounded by the START
        and END tokens; each adjacent pair (word, following word) is then
        registered through the Dictogram's add_count().

        Args:
            sentence: the sentence text to add to the chain.
        """
        words = [self.START] + sentence.split(' ') + [self.END]
        # Walk adjacent pairs; END is only ever a successor, never a key.
        for current_word, next_word in zip(words, words[1:]):
            if current_word not in self:
                self[current_word] = Dictogram()
            self[current_word].add_count(next_word)

    def get_next_word(self, dictogram):
        """Pick a random word from a histogram, weighted by its count.

        Args:
            dictogram: mapping of word -> integer weight (anything that
                provides .items()).

        Returns:
            One of the histogram's words; a word with weight w is chosen
            with probability w / (sum of all weights).

        Raises:
            ValueError: if the histogram is empty or its total weight is
                not positive.
        """
        entries = list(dictogram.items())
        if not entries:
            raise ValueError("cannot sample from an empty histogram")
        total = sum(weight for _, weight in entries)
        # randint is inclusive on both ends, so draw from 1..total: exactly
        # `total` equally likely outcomes, `weight` of them for each word.
        # (Drawing from 0..total would give the first word an extra chance.)
        rand = randint(1, total)
        cumulative = 0
        for word, weight in entries:
            cumulative += weight
            if rand <= cumulative:
                return word

    def make_sentence(self, length=8):
        """Generate a random sentence by walking the chain from START to END.

        Args:
            length: accepted for backward compatibility; it is not used,
                generation always continues until the END token is drawn.

        Returns:
            The generated words joined by single spaces (END excluded), or
            an empty string if no sentence has been compiled yet.
        """
        if self.START not in self:
            # Nothing has been compiled, so there is no word to start from.
            return ''
        words = [self.get_next_word(self[self.START])]
        # Compare by value: `is not` only works when the token happens to be
        # the very same string object.
        while words[-1] != self.END:
            words.append(self.get_next_word(self[words[-1]]))
        return ' '.join(words[:-1])
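# ..... Visual .....
# Sentence: "one fish two fish red fish blue fish two shark"
# Resulting chain (each word maps to the counts of the words that follow it):
# {
#   START : { one: 1 },
#   one   : { fish: 1 },
#   fish  : { two: 2, red: 1, blue: 1 },
#   two   : { fish: 1, shark: 1 },
#   red   : { fish: 1 },
#   blue  : { fish: 1 },
#   shark : { END: 1 },
# }
# START and END stand for the start/stop tokens (self.START, self.END) that
# are added around every sentence before it is compiled.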