-
Notifications
You must be signed in to change notification settings - Fork 0
/
genMarkovDict.py
56 lines (42 loc) · 1.18 KB
/
genMarkovDict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# from nltk.tokenize import RegexpTokenizer
import nltk
import re
import pprint
import random
import sys
import getopt
import glob
import markov
def checkargs():
    """Parse the command line and return the settings for the generator.

    Options (parsed with ``getopt``):

    * ``-k <n>``    key length; optional, defaults to 1.
    * ``-i <list>`` comma-separated glob patterns naming the input files
      (required).
    * ``-d <file>`` dictionary file name (required).

    Returns:
        A ``(keyLen, fileList, dictFile)`` tuple: the key length as an int,
        the paths matched by the ``-i`` patterns (in pattern order, possibly
        empty when nothing matches), and the ``-d`` value as given.

    Raises:
        SystemExit: with status 2, after printing the usage line, when too
            few arguments are given, an option is unknown or lacks its
            value, ``-i`` or ``-d`` is missing, or ``-k`` is not an integer.
    """
    usage = ("Usage: " + sys.argv[0] +
             " -k <Key length> -i <input files> -d <dictionary file> ")

    def fail(reason=None):
        # A bad invocation is an error: exit non-zero so calling scripts can
        # tell it apart from a successful run.
        if reason:
            print("Error: " + reason)
        print(usage)
        sys.exit(2)

    if len(sys.argv) < 3:
        fail()

    try:
        opts, _extra = getopt.getopt(sys.argv[1:], 'k:i:d:')
    except getopt.GetoptError as err:
        fail(str(err))

    # dict() keeps the last value when an option is repeated, matching the
    # behaviour of assigning the pairs one by one.
    arg = dict(opts)

    missing = [flag for flag in ('-i', '-d') if flag not in arg]
    if missing:
        fail("missing required option(s): " + ", ".join(missing))

    try:
        keyLen = int(arg.get('-k', 1))
    except ValueError:
        fail("key length must be an integer, got %r" % arg['-k'])

    dictFile = arg['-d']

    # Expand every pattern; a pattern that matches nothing contributes nothing.
    fileList = []
    for filePattern in arg['-i'].split(","):
        fileList.extend(glob.glob(filePattern))

    return (keyLen, fileList, dictFile)
def main():
    """Build a Markov dictionary from the input files and write it out.

    Gets the key length, input file list and dictionary file name from
    ``checkargs()``, creates a ``markov.Markov`` with that key length, feeds
    it every input file, writes the result with ``outputDict`` and prints a
    one-line summary of the line and word counts the object reports.
    """
    keyLen, fileList, dictFile = checkargs()

    markovObj = markov.Markov(keyLen)

    for path in fileList:
        try:
            markovObj.readFile(path, "utf-8")
        except Exception:
            # Retry with windows-1252 when the first read fails. Catching
            # Exception rather than using a bare except lets Ctrl-C
            # (KeyboardInterrupt) and SystemExit stop the run instead of
            # triggering a second read.
            # NOTE(review): presumably the failure of interest is a decode
            # error; narrow this to UnicodeDecodeError once readFile's
            # behaviour is confirmed. Also verify that a partially consumed
            # file is not counted twice by the retry.
            markovObj.readFile(path, "windows-1252")

    markovObj.outputDict(dictFile)

    print( "Generated Markov dictionary %s with processing %s input lines and %s input words " % ( dictFile, str(markovObj.getLineCount()), str(markovObj.getWordCount()) ) )


# Run the generator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()